Diffstat (limited to 'lib')
-rw-r--r--  lib/Kconfig | 92
-rw-r--r--  lib/Kconfig.debug | 114
-rw-r--r--  lib/Kconfig.kasan | 12
-rw-r--r--  lib/Kconfig.kcsan | 6
-rw-r--r--  lib/Kconfig.kmsan | 11
-rw-r--r--  lib/Makefile | 42
-rw-r--r--  lib/alloc_tag.c | 75
-rw-r--r--  lib/bitmap.c | 6
-rw-r--r--  lib/btree.c | 4
-rw-r--r--  lib/bug.c | 90
-rw-r--r--  lib/clz_ctz.c | 8
-rw-r--r--  lib/codetag.c | 17
-rw-r--r--  lib/crc/.gitignore | 5
-rw-r--r--  lib/crc/Kconfig | 119
-rw-r--r--  lib/crc/Makefile | 63
-rw-r--r--  lib/crc/arm/crc-t10dif-core.S | 468
-rw-r--r--  lib/crc/arm/crc-t10dif.h | 46
-rw-r--r--  lib/crc/arm/crc32-core.S | 306
-rw-r--r--  lib/crc/arm/crc32.h | 96
-rw-r--r--  lib/crc/arm64/crc-t10dif-core.S | 469
-rw-r--r--  lib/crc/arm64/crc-t10dif.h | 48
-rw-r--r--  lib/crc/arm64/crc32-core.S | 362
-rw-r--r--  lib/crc/arm64/crc32.h | 85
-rw-r--r--  lib/crc/crc-ccitt.c (renamed from lib/crc-ccitt.c) | 8
-rw-r--r--  lib/crc/crc-itu-t.c (renamed from lib/crc-itu-t.c) | 5
-rw-r--r--  lib/crc/crc-t10dif-main.c (renamed from lib/crc-t10dif.c) | 42
-rw-r--r--  lib/crc/crc16.c (renamed from lib/crc16.c) | 5
-rw-r--r--  lib/crc/crc32-main.c | 105
-rw-r--r--  lib/crc/crc4.c (renamed from lib/crc4.c) | 1
-rw-r--r--  lib/crc/crc64-main.c (renamed from lib/crc64.c) | 51
-rw-r--r--  lib/crc/crc7.c (renamed from lib/crc7.c) | 5
-rw-r--r--  lib/crc/crc8.c (renamed from lib/crc8.c) | 3
-rw-r--r--  lib/crc/gen_crc32table.c (renamed from lib/gen_crc32table.c) | 4
-rw-r--r--  lib/crc/gen_crc64table.c (renamed from lib/gen_crc64table.c) | 11
-rw-r--r--  lib/crc/loongarch/crc32.h | 115
-rw-r--r--  lib/crc/mips/crc32.h | 162
-rw-r--r--  lib/crc/powerpc/crc-t10dif.h | 70
-rw-r--r--  lib/crc/powerpc/crc-vpmsum-template.S | 746
-rw-r--r--  lib/crc/powerpc/crc32.h | 70
-rw-r--r--  lib/crc/powerpc/crc32c-vpmsum_asm.S | 842
-rw-r--r--  lib/crc/powerpc/crct10dif-vpmsum_asm.S | 845
-rw-r--r--  lib/crc/riscv/crc-clmul-consts.h | 122
-rw-r--r--  lib/crc/riscv/crc-clmul-template.h | 265
-rw-r--r--  lib/crc/riscv/crc-clmul.h | 23
-rw-r--r--  lib/crc/riscv/crc-t10dif.h | 18
-rw-r--r--  lib/crc/riscv/crc16_msb.c | 18
-rw-r--r--  lib/crc/riscv/crc32.h | 44
-rw-r--r--  lib/crc/riscv/crc32_lsb.c | 18
-rw-r--r--  lib/crc/riscv/crc32_msb.c | 18
-rw-r--r--  lib/crc/riscv/crc64.h | 27
-rw-r--r--  lib/crc/riscv/crc64_lsb.c | 18
-rw-r--r--  lib/crc/riscv/crc64_msb.c | 18
-rw-r--r--  lib/crc/s390/crc32-vx.h | 12
-rw-r--r--  lib/crc/s390/crc32.h | 67
-rw-r--r--  lib/crc/s390/crc32be-vx.c | 174
-rw-r--r--  lib/crc/s390/crc32le-vx.c | 240
-rw-r--r--  lib/crc/sparc/crc32.h | 67
-rw-r--r--  lib/crc/sparc/crc32c_asm.S | 20
-rw-r--r--  lib/crc/tests/Makefile | 2
-rw-r--r--  lib/crc/tests/crc_kunit.c (renamed from lib/tests/crc_kunit.c) | 99
-rw-r--r--  lib/crc/x86/crc-pclmul-consts.h | 240
-rw-r--r--  lib/crc/x86/crc-pclmul-template.S | 575
-rw-r--r--  lib/crc/x86/crc-pclmul-template.h | 71
-rw-r--r--  lib/crc/x86/crc-t10dif.h | 35
-rw-r--r--  lib/crc/x86/crc16-msb-pclmul.S | 6
-rw-r--r--  lib/crc/x86/crc32-pclmul.S | 6
-rw-r--r--  lib/crc/x86/crc32.h | 137
-rw-r--r--  lib/crc/x86/crc32c-3way.S | 360
-rw-r--r--  lib/crc/x86/crc64-pclmul.S | 7
-rw-r--r--  lib/crc/x86/crc64.h | 48
-rw-r--r--  lib/crc32.c | 126
-rw-r--r--  lib/crypto/Kconfig | 256
-rw-r--r--  lib/crypto/Makefile | 298
-rw-r--r--  lib/crypto/aes.c | 1
-rw-r--r--  lib/crypto/aescfb.c | 15
-rw-r--r--  lib/crypto/aesgcm.c | 51
-rw-r--r--  lib/crypto/arc4.c | 1
-rw-r--r--  lib/crypto/arm/.gitignore | 4
-rw-r--r--  lib/crypto/arm/blake2b-neon-core.S | 350
-rw-r--r--  lib/crypto/arm/blake2b.h | 40
-rw-r--r--  lib/crypto/arm/blake2s-core.S | 309
-rw-r--r--  lib/crypto/arm/blake2s.h | 5
-rw-r--r--  lib/crypto/arm/chacha-neon-core.S | 643
-rw-r--r--  lib/crypto/arm/chacha-scalar-core.S | 444
-rw-r--r--  lib/crypto/arm/chacha.h | 114
-rw-r--r--  lib/crypto/arm/curve25519-core.S | 2062
-rw-r--r--  lib/crypto/arm/curve25519.h | 46
-rw-r--r--  lib/crypto/arm/poly1305-armv4.pl | 1235
-rw-r--r--  lib/crypto/arm/poly1305.h | 51
-rw-r--r--  lib/crypto/arm/sha1-armv4-large.S | 507
-rw-r--r--  lib/crypto/arm/sha1-armv7-neon.S | 633
-rw-r--r--  lib/crypto/arm/sha1-ce-core.S | 123
-rw-r--r--  lib/crypto/arm/sha1.h | 45
-rw-r--r--  lib/crypto/arm/sha256-armv4.pl | 724
-rw-r--r--  lib/crypto/arm/sha256-ce.S | 123
-rw-r--r--  lib/crypto/arm/sha256.h | 46
-rw-r--r--  lib/crypto/arm/sha512-armv4.pl | 657
-rw-r--r--  lib/crypto/arm/sha512.h | 36
-rw-r--r--  lib/crypto/arm64/.gitignore | 4
-rw-r--r--  lib/crypto/arm64/chacha-neon-core.S | 805
-rw-r--r--  lib/crypto/arm64/chacha.h | 96
-rw-r--r--  lib/crypto/arm64/poly1305-armv8.pl | 920
-rw-r--r--  lib/crypto/arm64/poly1305.h | 48
-rw-r--r--  lib/crypto/arm64/polyval-ce-core.S | 359
-rw-r--r--  lib/crypto/arm64/polyval.h | 80
-rw-r--r--  lib/crypto/arm64/sha1-ce-core.S | 130
-rw-r--r--  lib/crypto/arm64/sha1.h | 38
-rw-r--r--  lib/crypto/arm64/sha2-armv8.pl | 786
-rw-r--r--  lib/crypto/arm64/sha256-ce.S | 408
-rw-r--r--  lib/crypto/arm64/sha256.h | 91
-rw-r--r--  lib/crypto/arm64/sha3-ce-core.S | 213
-rw-r--r--  lib/crypto/arm64/sha3.h | 59
-rw-r--r--  lib/crypto/arm64/sha512-ce-core.S | 197
-rw-r--r--  lib/crypto/arm64/sha512.h | 45
-rw-r--r--  lib/crypto/blake2b.c | 174
-rw-r--r--  lib/crypto/blake2s-generic.c | 110
-rw-r--r--  lib/crypto/blake2s-selftest.c | 651
-rw-r--r--  lib/crypto/blake2s.c | 148
-rw-r--r--  lib/crypto/chacha-block-generic.c | 114
-rw-r--r--  lib/crypto/chacha.c | 146
-rw-r--r--  lib/crypto/chacha20poly1305.c | 26
-rw-r--r--  lib/crypto/curve25519-generic.c | 24
-rw-r--r--  lib/crypto/curve25519.c | 69
-rw-r--r--  lib/crypto/des.c | 7
-rw-r--r--  lib/crypto/fips.h | 45
-rw-r--r--  lib/crypto/gf128mul.c | 1
-rw-r--r--  lib/crypto/hash_info.c | 63
-rw-r--r--  lib/crypto/libchacha.c | 36
-rw-r--r--  lib/crypto/md5.c | 322
-rw-r--r--  lib/crypto/memneq.c | 3
-rw-r--r--  lib/crypto/mips/.gitignore | 2
-rw-r--r--  lib/crypto/mips/chacha-core.S | 491
-rw-r--r--  lib/crypto/mips/chacha.h | 14
-rw-r--r--  lib/crypto/mips/md5.h | 65
-rw-r--r--  lib/crypto/mips/poly1305-mips.pl | 1269
-rw-r--r--  lib/crypto/mips/poly1305.h | 14
-rw-r--r--  lib/crypto/mips/sha1.h | 81
-rw-r--r--  lib/crypto/mips/sha256.h | 58
-rw-r--r--  lib/crypto/mips/sha512.h | 74
-rw-r--r--  lib/crypto/mpi/mpi-add.c | 2
-rw-r--r--  lib/crypto/mpi/mpi-bit.c | 2
-rw-r--r--  lib/crypto/mpi/mpi-cmp.c | 2
-rw-r--r--  lib/crypto/mpi/mpi-mul.c | 2
-rw-r--r--  lib/crypto/mpi/mpi-pow.c | 2
-rw-r--r--  lib/crypto/mpi/mpi-sub-ui.c | 2
-rw-r--r--  lib/crypto/mpi/mpicoder.c | 3
-rw-r--r--  lib/crypto/mpi/mpiutil.c | 2
-rw-r--r--  lib/crypto/poly1305-donna32.c | 3
-rw-r--r--  lib/crypto/poly1305-donna64.c | 3
-rw-r--r--  lib/crypto/poly1305-generic.c | 24
-rw-r--r--  lib/crypto/poly1305.c | 82
-rw-r--r--  lib/crypto/polyval.c | 307
-rw-r--r--  lib/crypto/powerpc/chacha-p10le-8x.S | 840
-rw-r--r--  lib/crypto/powerpc/chacha.h | 76
-rw-r--r--  lib/crypto/powerpc/curve25519-ppc64le_asm.S | 671
-rw-r--r--  lib/crypto/powerpc/curve25519.h | 186
-rw-r--r--  lib/crypto/powerpc/md5-asm.S | 235
-rw-r--r--  lib/crypto/powerpc/md5.h | 12
-rw-r--r--  lib/crypto/powerpc/poly1305-p10le_64.S | 1075
-rw-r--r--  lib/crypto/powerpc/poly1305.h | 74
-rw-r--r--  lib/crypto/powerpc/sha1-powerpc-asm.S | 188
-rw-r--r--  lib/crypto/powerpc/sha1-spe-asm.S | 294
-rw-r--r--  lib/crypto/powerpc/sha1.h | 67
-rw-r--r--  lib/crypto/powerpc/sha256-spe-asm.S | 318
-rw-r--r--  lib/crypto/powerpc/sha256.h | 58
-rw-r--r--  lib/crypto/riscv/chacha-riscv64-zvkb.S | 297
-rw-r--r--  lib/crypto/riscv/chacha.h | 51
-rw-r--r--  lib/crypto/riscv/poly1305-riscv.pl | 847
-rw-r--r--  lib/crypto/riscv/poly1305.h | 14
-rw-r--r--  lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S | 225
-rw-r--r--  lib/crypto/riscv/sha256.h | 42
-rw-r--r--  lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S | 203
-rw-r--r--  lib/crypto/riscv/sha512.h | 39
-rw-r--r--  lib/crypto/s390/chacha-s390.S | 908
-rw-r--r--  lib/crypto/s390/chacha-s390.h | 14
-rw-r--r--  lib/crypto/s390/chacha.h | 36
-rw-r--r--  lib/crypto/s390/sha1.h | 28
-rw-r--r--  lib/crypto/s390/sha256.h | 28
-rw-r--r--  lib/crypto/s390/sha3.h | 151
-rw-r--r--  lib/crypto/s390/sha512.h | 28
-rw-r--r--  lib/crypto/sha1.c | 246
-rw-r--r--  lib/crypto/sha256-generic.c | 137
-rw-r--r--  lib/crypto/sha256.c | 512
-rw-r--r--  lib/crypto/sha3.c | 411
-rw-r--r--  lib/crypto/sha512.c | 440
-rw-r--r--  lib/crypto/sm3.c | 1
-rw-r--r--  lib/crypto/sparc/md5.h | 48
-rw-r--r--  lib/crypto/sparc/md5_asm.S | 70
-rw-r--r--  lib/crypto/sparc/sha1.h | 43
-rw-r--r--  lib/crypto/sparc/sha1_asm.S | 72
-rw-r--r--  lib/crypto/sparc/sha256.h | 43
-rw-r--r--  lib/crypto/sparc/sha256_asm.S | 78
-rw-r--r--  lib/crypto/sparc/sha512.h | 42
-rw-r--r--  lib/crypto/sparc/sha512_asm.S | 102
-rw-r--r--  lib/crypto/tests/Kconfig | 118
-rw-r--r--  lib/crypto/tests/Makefile | 12
-rw-r--r--  lib/crypto/tests/blake2b-testvecs.h | 342
-rw-r--r--  lib/crypto/tests/blake2b_kunit.c | 133
-rw-r--r--  lib/crypto/tests/blake2s-testvecs.h | 238
-rw-r--r--  lib/crypto/tests/blake2s_kunit.c | 133
-rw-r--r--  lib/crypto/tests/curve25519_kunit.c (renamed from lib/crypto/curve25519-selftest.c) | 102
-rw-r--r--  lib/crypto/tests/hash-test-template.h | 568
-rw-r--r--  lib/crypto/tests/md5-testvecs.h | 186
-rw-r--r--  lib/crypto/tests/md5_kunit.c | 39
-rw-r--r--  lib/crypto/tests/poly1305-testvecs.h | 186
-rw-r--r--  lib/crypto/tests/poly1305_kunit.c | 165
-rw-r--r--  lib/crypto/tests/polyval-testvecs.h | 186
-rw-r--r--  lib/crypto/tests/polyval_kunit.c | 223
-rw-r--r--  lib/crypto/tests/sha1-testvecs.h | 212
-rw-r--r--  lib/crypto/tests/sha1_kunit.c | 39
-rw-r--r--  lib/crypto/tests/sha224-testvecs.h | 238
-rw-r--r--  lib/crypto/tests/sha224_kunit.c | 39
-rw-r--r--  lib/crypto/tests/sha256-testvecs.h | 238
-rw-r--r--  lib/crypto/tests/sha256_kunit.c | 224
-rw-r--r--  lib/crypto/tests/sha3-testvecs.h | 249
-rw-r--r--  lib/crypto/tests/sha384-testvecs.h | 290
-rw-r--r--  lib/crypto/tests/sha384_kunit.c | 39
-rw-r--r--  lib/crypto/tests/sha3_kunit.c | 422
-rw-r--r--  lib/crypto/tests/sha512-testvecs.h | 342
-rw-r--r--  lib/crypto/tests/sha512_kunit.c | 39
-rw-r--r--  lib/crypto/utils.c | 3
-rw-r--r--  lib/crypto/x86/.gitignore | 2
-rw-r--r--  lib/crypto/x86/blake2s-core.S | 291
-rw-r--r--  lib/crypto/x86/blake2s.h | 62
-rw-r--r--  lib/crypto/x86/chacha-avx2-x86_64.S | 1021
-rw-r--r--  lib/crypto/x86/chacha-avx512vl-x86_64.S | 836
-rw-r--r--  lib/crypto/x86/chacha-ssse3-x86_64.S | 791
-rw-r--r--  lib/crypto/x86/chacha.h | 176
-rw-r--r--  lib/crypto/x86/curve25519.h | 1613
-rw-r--r--  lib/crypto/x86/poly1305-x86_64-cryptogams.pl | 4240
-rw-r--r--  lib/crypto/x86/poly1305.h | 158
-rw-r--r--  lib/crypto/x86/polyval-pclmul-avx.S | 319
-rw-r--r--  lib/crypto/x86/polyval.h | 83
-rw-r--r--  lib/crypto/x86/sha1-avx2-asm.S | 697
-rw-r--r--  lib/crypto/x86/sha1-ni-asm.S | 152
-rw-r--r--  lib/crypto/x86/sha1-ssse3-and-avx.S | 551
-rw-r--r--  lib/crypto/x86/sha1.h | 74
-rw-r--r--  lib/crypto/x86/sha256-avx-asm.S | 493
-rw-r--r--  lib/crypto/x86/sha256-avx2-asm.S | 770
-rw-r--r--  lib/crypto/x86/sha256-ni-asm.S | 559
-rw-r--r--  lib/crypto/x86/sha256-ssse3-asm.S | 505
-rw-r--r--  lib/crypto/x86/sha256.h | 95
-rw-r--r--  lib/crypto/x86/sha512-avx-asm.S | 420
-rw-r--r--  lib/crypto/x86/sha512-avx2-asm.S | 748
-rw-r--r--  lib/crypto/x86/sha512-ssse3-asm.S | 419
-rw-r--r--  lib/crypto/x86/sha512.h | 52
-rw-r--r--  lib/debugobjects.c | 6
-rw-r--r--  lib/decompress.c | 21
-rw-r--r--  lib/digsig.c | 47
-rw-r--r--  lib/dump_stack.c | 2
-rw-r--r--  lib/fault-inject-usercopy.c | 4
-rw-r--r--  lib/find_bit.c | 24
-rw-r--r--  lib/find_bit_benchmark_rust.rs | 104
-rw-r--r--  lib/genalloc.c | 5
-rw-r--r--  lib/group_cpus.c | 25
-rw-r--r--  lib/interval_tree.c | 1
-rw-r--r--  lib/iov_iter.c | 117
-rw-r--r--  lib/kobject_uevent.c | 20
-rw-r--r--  lib/kunit/Kconfig | 24
-rw-r--r--  lib/kunit/Makefile | 2
-rw-r--r--  lib/kunit/kunit-example-test.c | 217
-rw-r--r--  lib/kunit/kunit-test.c | 57
-rw-r--r--  lib/kunit/test.c | 150
-rw-r--r--  lib/kunit/try-catch-impl.h | 4
-rw-r--r--  lib/kunit/try-catch.c | 29
-rw-r--r--  lib/kunit/user_alloc.c | 4
-rw-r--r--  lib/locking-selftest.c | 4
-rw-r--r--  lib/lzo/lzo1x_compress.c | 2
-rw-r--r--  lib/lzo/lzo1x_decompress_safe.c | 6
-rw-r--r--  lib/maple_tree.c | 748
-rw-r--r--  lib/math/div64.c | 13
-rw-r--r--  lib/math/gcd.c | 27
-rw-r--r--  lib/raid6/algos.c | 3
-rw-r--r--  lib/raid6/neon.c | 17
-rw-r--r--  lib/raid6/recov.c | 6
-rw-r--r--  lib/raid6/recov_avx2.c | 6
-rw-r--r--  lib/raid6/recov_avx512.c | 6
-rw-r--r--  lib/raid6/recov_loongarch_simd.c | 12
-rw-r--r--  lib/raid6/recov_neon.c | 21
-rw-r--r--  lib/raid6/recov_rvv.c | 8
-rw-r--r--  lib/raid6/recov_s390xc.c | 7
-rw-r--r--  lib/raid6/recov_ssse3.c | 6
-rw-r--r--  lib/raid6/rvv.c | 111
-rw-r--r--  lib/ref_tracker.c | 295
-rw-r--r--  lib/rhashtable.c | 4
-rw-r--r--  lib/sbitmap.c | 74
-rw-r--r--  lib/smp_processor_id.c | 2
-rw-r--r--  lib/stackdepot.c | 67
-rw-r--r--  lib/strncpy_from_user.c | 2
-rw-r--r--  lib/strnlen_user.c | 2
-rw-r--r--  lib/sys_info.c | 123
-rw-r--r--  lib/test_firmware.c | 7
-rw-r--r--  lib/test_hmm.c | 16
-rw-r--r--  lib/test_kho.c | 339
-rw-r--r--  lib/test_maple_tree.c | 171
-rw-r--r--  lib/test_objagg.c | 81
-rw-r--r--  lib/test_objpool.c | 2
-rw-r--r--  lib/test_vmalloc.c | 42
-rw-r--r--  lib/tests/Makefile | 4
-rw-r--r--  lib/tests/ffs_kunit.c | 566
-rw-r--r--  lib/tests/fortify_kunit.c | 4
-rw-r--r--  lib/tests/longest_symbol_kunit.c | 3
-rw-r--r--  lib/tests/seq_buf_kunit.c | 208
-rw-r--r--  lib/tests/test_bits.c | 19
-rw-r--r--  lib/tests/test_ratelimit.c | 144
-rw-r--r--  lib/ubsan.c | 6
-rw-r--r--  lib/vdso/Kconfig | 25
-rw-r--r--  lib/vdso/Makefile | 2
-rw-r--r--  lib/vdso/datastore.c | 6
-rw-r--r--  lib/vdso/gettimeofday.c | 237
-rw-r--r--  lib/vsprintf.c | 70
-rw-r--r--  lib/xarray.c | 5
-rw-r--r--  lib/xxhash.c | 107
313 files changed, 56617 insertions, 3320 deletions
diff --git a/lib/Kconfig b/lib/Kconfig
index 6c1b8f184267..e629449dd2a3 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -136,95 +136,9 @@ config TRACE_MMIO_ACCESS
Create tracepoints for MMIO read/write operations. These trace events
can be used for logging all MMIO read/write operations.
+source "lib/crc/Kconfig"
source "lib/crypto/Kconfig"
-config CRC_CCITT
- tristate
- help
- The CRC-CCITT library functions. Select this if your module uses any
- of the functions from <linux/crc-ccitt.h>.
-
-config CRC16
- tristate
- help
- The CRC16 library functions. Select this if your module uses any of
- the functions from <linux/crc16.h>.
-
-config CRC_T10DIF
- tristate
- help
- The CRC-T10DIF library functions. Select this if your module uses
- any of the functions from <linux/crc-t10dif.h>.
-
-config ARCH_HAS_CRC_T10DIF
- bool
-
-config CRC_T10DIF_ARCH
- tristate
- default CRC_T10DIF if ARCH_HAS_CRC_T10DIF && CRC_OPTIMIZATIONS
-
-config CRC_ITU_T
- tristate
- help
- The CRC-ITU-T library functions. Select this if your module uses
- any of the functions from <linux/crc-itu-t.h>.
-
-config CRC32
- tristate
- select BITREVERSE
- help
- The CRC32 library functions. Select this if your module uses any of
- the functions from <linux/crc32.h> or <linux/crc32c.h>.
-
-config ARCH_HAS_CRC32
- bool
-
-config CRC32_ARCH
- tristate
- default CRC32 if ARCH_HAS_CRC32 && CRC_OPTIMIZATIONS
-
-config CRC64
- tristate
- help
- The CRC64 library functions. Select this if your module uses any of
- the functions from <linux/crc64.h>.
-
-config ARCH_HAS_CRC64
- bool
-
-config CRC64_ARCH
- tristate
- default CRC64 if ARCH_HAS_CRC64 && CRC_OPTIMIZATIONS
-
-config CRC4
- tristate
- help
- The CRC4 library functions. Select this if your module uses any of
- the functions from <linux/crc4.h>.
-
-config CRC7
- tristate
- help
- The CRC7 library functions. Select this if your module uses any of
- the functions from <linux/crc7.h>.
-
-config CRC8
- tristate
- help
- The CRC8 library functions. Select this if your module uses any of
- the functions from <linux/crc8.h>.
-
-config CRC_OPTIMIZATIONS
- bool "Enable optimized CRC implementations" if EXPERT
- default y
- help
- Disabling this option reduces code size slightly by disabling the
- architecture-optimized implementations of any CRC variants that are
- enabled. CRC checksumming performance may get much slower.
-
- Keep this enabled unless you're really trying to minimize the size of
- the kernel.
-
config XXHASH
tristate
@@ -563,8 +477,7 @@ config MPILIB
config SIGNATURE
tristate
depends on KEYS
- select CRYPTO
- select CRYPTO_SHA1
+ select CRYPTO_LIB_SHA1
select MPILIB
help
Digital signature verification. Currently only RSA is supported.
@@ -716,6 +629,7 @@ config GENERIC_LIB_DEVMEM_IS_ALLOWED
config PLDMFW
bool
+ select CRC32
default n
config ASN1_ENCODER
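
The PLDMFW hunk above now selects CRC32 because the firmware-update code calls into <linux/crc32.h>. As a minimal sketch of what a selecting module does with that dependency (the helper and field names are illustrative; only crc32_le() is the real library entry point):

	#include <linux/crc32.h>

	/* Hypothetical helper: check a firmware image against a stored CRC.
	 * ~0 is the customary seed and the final inversion matches the
	 * common CRC-32 convention. */
	static bool fw_image_crc_ok(const u8 *data, size_t len, u32 expected)
	{
		return (~crc32_le(~0, data, len)) == expected;
	}
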
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ebe33181b6e6..742b23ef0d8b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -206,6 +206,16 @@ config DEBUG_BUGVERBOSE
of the BUG call as well as the EIP and oops trace. This aids
debugging but costs about 70-100K of memory.
+config DEBUG_BUGVERBOSE_DETAILED
+ bool "Verbose WARN_ON_ONCE() reporting (adds 100K)" if DEBUG_BUGVERBOSE
+ help
+ Say Y here to make WARN_ON_ONCE() output the condition string of the
+ warning, in addition to the file name and line number.
+ This helps debugging, but costs about 100K of memory.
+
+ Say N if unsure.
+
endmenu # "printk and dmesg options"
config DEBUG_KERNEL
@@ -259,7 +269,7 @@ config DEBUG_INFO_NONE
config DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT
bool "Rely on the toolchain's implicit default DWARF version"
select DEBUG_INFO
- depends on !CC_IS_CLANG || AS_IS_LLVM || CLANG_VERSION < 140000 || (AS_IS_GNU && AS_VERSION >= 23502 && AS_HAS_NON_CONST_ULEB128)
+ depends on !CC_IS_CLANG || AS_IS_LLVM || (AS_IS_GNU && AS_VERSION >= 23502 && AS_HAS_NON_CONST_ULEB128)
help
The implicit default version of DWARF debug info produced by a
toolchain changes over time.
@@ -445,8 +455,7 @@ config FRAME_WARN
default 2048 if GCC_PLUGIN_LATENT_ENTROPY
default 2048 if PARISC
default 1536 if (!64BIT && XTENSA)
- default 1280 if KASAN && !64BIT
- default 1024 if !64BIT
+ default 1280 if !64BIT
default 2048 if 64BIT
help
Tell the compiler to warn at build time for stack frames larger than this.
@@ -1067,12 +1076,6 @@ config PANIC_ON_OOPS
Say N if unsure.
-config PANIC_ON_OOPS_VALUE
- int
- range 0 1
- default 0 if !PANIC_ON_OOPS
- default 1 if PANIC_ON_OOPS
-
config PANIC_TIMEOUT
int "panic timeout"
default 0
@@ -2460,6 +2463,15 @@ config SCANF_KUNIT_TEST
If unsure, say N.
+config SEQ_BUF_KUNIT_TEST
+ tristate "KUnit test for seq_buf" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ This builds unit tests for the seq_buf library.
+
+ If unsure, say N.
+
config STRING_KUNIT_TEST
tristate "KUnit test string functions at runtime" if !KUNIT_ALL_TESTS
depends on KUNIT
@@ -2470,6 +2482,20 @@ config STRING_HELPERS_KUNIT_TEST
depends on KUNIT
default KUNIT_ALL_TESTS
+config FFS_KUNIT_TEST
+ tristate "KUnit test ffs-family functions at runtime" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ This builds KUnit tests for ffs-family bit manipulation functions
+ including ffs(), __ffs(), fls(), __fls(), fls64(), and __ffs64().
+
+ These tests validate mathematical correctness, edge case handling,
+ and cross-architecture consistency of bit scanning functions.
+
+ For more information on KUnit and unit tests in general,
+ please refer to Documentation/dev-tools/kunit/.
+
config TEST_KSTRTOX
tristate "Test kstrto*() family of functions at runtime"
@@ -2506,8 +2532,8 @@ config TEST_IDA
tristate "Perform selftest on IDA functions"
config TEST_MISC_MINOR
- tristate "miscdevice KUnit test" if !KUNIT_ALL_TESTS
- depends on KUNIT
+ bool "miscdevice KUnit test" if !KUNIT_ALL_TESTS
+ depends on KUNIT=y
default KUNIT_ALL_TESTS
help
KUnit test for the miscdevice API, especially its behavior with respect to
@@ -2598,6 +2624,19 @@ config FIND_BIT_BENCHMARK
If unsure, say N.
+config FIND_BIT_BENCHMARK_RUST
+ tristate "Test find_bit functions in Rust"
+ depends on RUST
+ help
+ This builds the "find_bit_benchmark_rust" module. It is a micro
+ benchmark that measures the performance of Rust functions that
+ correspond to the find_*_bit() operations in C. It follows the
+ FIND_BIT_BENCHMARK closely but will in general not yield same
+ numbers due to extra bounds checks and overhead of foreign
+ function calls.
+
+ If unsure, say N.
+
config TEST_FIRMWARE
tristate "Test firmware loading via userspace interface"
depends on FW_LOADER
@@ -2885,6 +2924,7 @@ config FORTIFY_KUNIT_TEST
config LONGEST_SYM_KUNIT_TEST
tristate "Test the longest symbol possible" if !KUNIT_ALL_TESTS
depends on KUNIT && KPROBES
+ depends on !PREFIX_SYMBOLS && !CFI && !GCOV_KERNEL
default KUNIT_ALL_TESTS
help
Tests the longest symbol possible
@@ -2901,27 +2941,6 @@ config HW_BREAKPOINT_KUNIT_TEST
If unsure, say N.
-config CRC_KUNIT_TEST
- tristate "KUnit tests for CRC functions" if !KUNIT_ALL_TESTS
- depends on KUNIT
- default KUNIT_ALL_TESTS
- select CRC7
- select CRC16
- select CRC_T10DIF
- select CRC32
- select CRC64
- help
- Unit tests for the CRC library functions.
-
- This is intended to help people writing architecture-specific
- optimized versions. If unsure, say N.
-
-config CRC_BENCHMARK
- bool "Benchmark for the CRC functions"
- depends on CRC_KUNIT_TEST
- help
- Include benchmarks in the KUnit test suite for the CRC functions.
-
config SIPHASH_KUNIT_TEST
tristate "Perform selftest on siphash functions" if !KUNIT_ALL_TESTS
depends on KUNIT
@@ -3225,6 +3244,37 @@ config TEST_OBJPOOL
If unsure, say N.
+config TEST_KEXEC_HANDOVER
+ bool "Test for Kexec HandOver"
+ default n
+ depends on KEXEC_HANDOVER
+ help
+ This option enables a test for Kexec HandOver (KHO).
+ The test consists of two parts: saving kernel data before kexec,
+ then restoring the data after kexec and verifying that it was
+ properly handed over. This test module creates and saves data on
+ the boot of the first kernel, then restores and verifies the data
+ on the boot of the kexec'ed kernel.
+
+ For detailed documentation about KHO, see Documentation/core-api/kho.
+
+ To run the test, run:
+
+ tools/testing/selftests/kho/vmtest.sh -h
+
+ If unsure, say N.
+
+config RATELIMIT_KUNIT_TEST
+ tristate "KUnit Test for correctness and stress of ratelimit" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ This builds the "test_ratelimit" module that should be used
+ for correctness verification and concurrent testings of rate
+ limiting.
+
+ If unsure, say N.
+
config INT_POW_KUNIT_TEST
tristate "Integer exponentiation (int_pow) test" if !KUNIT_ALL_TESTS
depends on KUNIT
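
For the DEBUG_BUGVERBOSE_DETAILED option added above, the caller-side picture is unchanged; only the report grows. A minimal sketch, assuming the condition string is printed verbatim (the exact console format is not taken from this patch):

	/* With CONFIG_DEBUG_BUGVERBOSE_DETAILED=y, the WARN report for this
	 * line can carry the condition string "ret != 0" in addition to the
	 * usual file:line pair -- that string data is the ~100K cost. */
	WARN_ON_ONCE(ret != 0);
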
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index f82889a830fa..a4bb610a7a6f 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -19,6 +19,18 @@ config ARCH_DISABLE_KASAN_INLINE
Disables both inline and stack instrumentation. Selected by
architectures that do not support these instrumentation types.
+config ARCH_NEEDS_DEFER_KASAN
+ bool
+
+config ARCH_DEFER_KASAN
+ def_bool y
+ depends on KASAN && ARCH_NEEDS_DEFER_KASAN
+ help
+ Architectures should select this if they need to defer KASAN
+ initialization until shadow memory is properly set up. This
+ enables runtime control via static keys. Otherwise, KASAN uses
+ compile-time constants for better performance.
+
config CC_HAS_KASAN_GENERIC
def_bool $(cc-option, -fsanitize=kernel-address)
diff --git a/lib/Kconfig.kcsan b/lib/Kconfig.kcsan
index 609ddfc73de5..4ce4b0c0109c 100644
--- a/lib/Kconfig.kcsan
+++ b/lib/Kconfig.kcsan
@@ -185,12 +185,6 @@ config KCSAN_WEAK_MEMORY
bool "Enable weak memory modeling to detect missing memory barriers"
default y
depends on KCSAN_STRICT
- # We can either let objtool nop __tsan_func_{entry,exit}() and builtin
- # atomics instrumentation in .noinstr.text, or use a compiler that can
- # implement __no_kcsan to really remove all instrumentation.
- depends on !ARCH_WANTS_NO_INSTR || HAVE_NOINSTR_HACK || \
- CC_IS_GCC || CLANG_VERSION >= 140000
- select OBJTOOL if HAVE_NOINSTR_HACK
help
Enable support for modeling a subset of weak memory, which allows
detecting a subset of data races due to missing memory barriers.
diff --git a/lib/Kconfig.kmsan b/lib/Kconfig.kmsan
index 0541d7b079cc..cae1ddcc18e1 100644
--- a/lib/Kconfig.kmsan
+++ b/lib/Kconfig.kmsan
@@ -3,10 +3,7 @@ config HAVE_ARCH_KMSAN
bool
config HAVE_KMSAN_COMPILER
- # Clang versions <14.0.0 also support -fsanitize=kernel-memory, but not
- # all the features necessary to build the kernel with KMSAN.
- depends on CC_IS_CLANG && CLANG_VERSION >= 140000
- def_bool $(cc-option,-fsanitize=kernel-memory -mllvm -msan-disable-checks=1)
+ def_bool $(cc-option,-fsanitize=kernel-memory)
config KMSAN
bool "KMSAN: detector of uninitialized values use"
@@ -28,15 +25,9 @@ config KMSAN
if KMSAN
-config HAVE_KMSAN_PARAM_RETVAL
- # -fsanitize-memory-param-retval is supported only by Clang >= 14.
- depends on HAVE_KMSAN_COMPILER
- def_bool $(cc-option,-fsanitize=kernel-memory -fsanitize-memory-param-retval)
-
config KMSAN_CHECK_PARAM_RETVAL
bool "Check for uninitialized values passed to and returned from functions"
default y
- depends on HAVE_KMSAN_PARAM_RETVAL
help
If the compiler supports -fsanitize-memory-param-retval, KMSAN will
eagerly check every function parameter passed by value and every
diff --git a/lib/Makefile b/lib/Makefile
index c38582f187dd..1ab2c4be3b66 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -40,7 +40,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
nmi_backtrace.o win_minmax.o memcat_p.o \
- buildid.o objpool.o iomem_copy.o
+ buildid.o objpool.o iomem_copy.o sys_info.o
lib-$(CONFIG_UNION_FIND) += union_find.o
lib-$(CONFIG_PRINTK) += dump_stack.o
@@ -62,6 +62,7 @@ obj-y += hexdump.o
obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o
obj-y += kstrtox.o
obj-$(CONFIG_FIND_BIT_BENCHMARK) += find_bit_benchmark.o
+obj-$(CONFIG_FIND_BIT_BENCHMARK_RUST) += find_bit_benchmark_rust.o
obj-$(CONFIG_TEST_BPF) += test_bpf.o
test_dhry-objs := dhry_1.o dhry_2.o dhry_run.o
obj-$(CONFIG_TEST_DHRY) += test_dhry.o
@@ -102,17 +103,14 @@ obj-$(CONFIG_TEST_HMM) += test_hmm.o
obj-$(CONFIG_TEST_FREE_PAGES) += test_free_pages.o
obj-$(CONFIG_TEST_REF_TRACKER) += test_ref_tracker.o
obj-$(CONFIG_TEST_OBJPOOL) += test_objpool.o
+obj-$(CONFIG_TEST_KEXEC_HANDOVER) += test_kho.o
obj-$(CONFIG_TEST_FPU) += test_fpu.o
test_fpu-y := test_fpu_glue.o test_fpu_impl.o
CFLAGS_test_fpu_impl.o += $(CC_FLAGS_FPU)
CFLAGS_REMOVE_test_fpu_impl.o += $(CC_FLAGS_NO_FPU)
-# Some KUnit files (hooks.o) need to be built-in even when KUnit is a module,
-# so we can't just use obj-$(CONFIG_KUNIT).
-ifdef CONFIG_KUNIT
obj-y += kunit/
-endif
ifeq ($(CONFIG_DEBUG_KOBJECT),y)
CFLAGS_kobject.o += -DDEBUG
@@ -122,7 +120,7 @@ endif
obj-$(CONFIG_DEBUG_INFO_REDUCED) += debug_info.o
CFLAGS_debug_info.o += $(call cc-option, -femit-struct-debug-detailed=any)
-obj-y += math/ crypto/ tests/ vdso/
+obj-y += math/ crc/ crypto/ tests/ vdso/
obj-$(CONFIG_GENERIC_IOMAP) += iomap.o
obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
@@ -148,15 +146,6 @@ obj-$(CONFIG_BITREVERSE) += bitrev.o
obj-$(CONFIG_LINEAR_RANGES) += linear_ranges.o
obj-$(CONFIG_PACKING) += packing.o
obj-$(CONFIG_PACKING_KUNIT_TEST) += packing_test.o
-obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o
-obj-$(CONFIG_CRC16) += crc16.o
-obj-$(CONFIG_CRC_T10DIF)+= crc-t10dif.o
-obj-$(CONFIG_CRC_ITU_T) += crc-itu-t.o
-obj-$(CONFIG_CRC32) += crc32.o
-obj-$(CONFIG_CRC64) += crc64.o
-obj-$(CONFIG_CRC4) += crc4.o
-obj-$(CONFIG_CRC7) += crc7.o
-obj-$(CONFIG_CRC8) += crc8.o
obj-$(CONFIG_XXHASH) += xxhash.o
obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
@@ -294,27 +283,6 @@ obj-$(CONFIG_ASN1_ENCODER) += asn1_encoder.o
obj-$(CONFIG_FONT_SUPPORT) += fonts/
-hostprogs := gen_crc32table
-hostprogs += gen_crc64table
-clean-files := crc32table.h
-clean-files += crc64table.h
-
-$(obj)/crc32.o: $(obj)/crc32table.h
-
-quiet_cmd_crc32 = GEN $@
- cmd_crc32 = $< > $@
-
-$(obj)/crc32table.h: $(obj)/gen_crc32table
- $(call cmd,crc32)
-
-$(obj)/crc64.o: $(obj)/crc64table.h
-
-quiet_cmd_crc64 = GEN $@
- cmd_crc64 = $< > $@
-
-$(obj)/crc64table.h: $(obj)/gen_crc64table
- $(call cmd,crc64)
-
#
# Build a fast OID lookup registry from include/linux/oid_registry.h
#
@@ -337,7 +305,7 @@ obj-$(CONFIG_UBSAN) += ubsan.o
UBSAN_SANITIZE_ubsan.o := n
KASAN_SANITIZE_ubsan.o := n
KCSAN_SANITIZE_ubsan.o := n
-CFLAGS_ubsan.o := -fno-stack-protector $(DISABLE_STACKLEAK_PLUGIN)
+CFLAGS_ubsan.o := -fno-stack-protector $(DISABLE_KSTACK_ERASE)
obj-$(CONFIG_SBITMAP) += sbitmap.o
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index d48b80f3f007..f26456988445 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -9,7 +9,9 @@
#include <linux/proc_fs.h>
#include <linux/seq_buf.h>
#include <linux/seq_file.h>
+#include <linux/string_choices.h>
#include <linux/vmalloc.h>
+#include <linux/kmemleak.h>
#define ALLOCINFO_FILE_NAME "allocinfo"
#define MODULE_ALLOC_TAG_VMAP_SIZE (100000UL * sizeof(struct alloc_tag))
@@ -24,8 +26,10 @@ static bool mem_profiling_support;
static struct codetag_type *alloc_tag_cttype;
+#ifdef CONFIG_ARCH_MODULE_NEEDS_WEAK_PER_CPU
DEFINE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
EXPORT_SYMBOL(_shared_alloc_tag);
+#endif
DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
mem_alloc_profiling_key);
@@ -45,21 +49,16 @@ struct allocinfo_private {
static void *allocinfo_start(struct seq_file *m, loff_t *pos)
{
struct allocinfo_private *priv;
- struct codetag *ct;
loff_t node = *pos;
- priv = kzalloc(sizeof(*priv), GFP_KERNEL);
- m->private = priv;
- if (!priv)
- return NULL;
-
- priv->print_header = (node == 0);
+ priv = (struct allocinfo_private *)m->private;
codetag_lock_module_list(alloc_tag_cttype, true);
- priv->iter = codetag_get_ct_iter(alloc_tag_cttype);
- while ((ct = codetag_next_ct(&priv->iter)) != NULL && node)
- node--;
-
- return ct ? priv : NULL;
+ if (node == 0) {
+ priv->print_header = true;
+ priv->iter = codetag_get_ct_iter(alloc_tag_cttype);
+ codetag_next_ct(&priv->iter);
+ }
+ return priv->iter.ct ? priv : NULL;
}
static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos)
@@ -76,18 +75,13 @@ static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos)
static void allocinfo_stop(struct seq_file *m, void *arg)
{
- struct allocinfo_private *priv = (struct allocinfo_private *)m->private;
-
- if (priv) {
- codetag_lock_module_list(alloc_tag_cttype, false);
- kfree(priv);
- }
+ codetag_lock_module_list(alloc_tag_cttype, false);
}
static void print_allocinfo_header(struct seq_buf *buf)
{
/* Output format version, so we can change it. */
- seq_buf_printf(buf, "allocinfo - version: 1.0\n");
+ seq_buf_printf(buf, "allocinfo - version: 2.0\n");
seq_buf_printf(buf, "# <size> <calls> <tag info>\n");
}
@@ -99,6 +93,8 @@ static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct)
seq_buf_printf(out, "%12lli %8llu ", bytes, counter.calls);
codetag_to_text(out, ct);
+ if (unlikely(alloc_tag_is_inaccurate(tag)))
+ seq_buf_printf(out, " accurate:no");
seq_buf_putc(out, ' ');
seq_buf_putc(out, '\n');
}
@@ -134,6 +130,9 @@ size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sl
struct codetag_bytes n;
unsigned int i, nr = 0;
+ if (IS_ERR_OR_NULL(alloc_tag_cttype))
+ return 0;
+
if (can_sleep)
codetag_lock_module_list(alloc_tag_cttype, true);
else if (!codetag_trylock_module_list(alloc_tag_cttype))
@@ -442,9 +441,10 @@ static int vm_module_tags_populate(void)
if (nr < more_pages ||
vmap_pages_range(phys_end, phys_end + (nr << PAGE_SHIFT), PAGE_KERNEL,
next_page, PAGE_SHIFT) < 0) {
+ release_pages_arg arg = { .pages = next_page };
+
/* Clean up and error out */
- for (int i = 0; i < nr; i++)
- __free_page(next_page[i]);
+ release_pages(arg, nr);
return -ENOMEM;
}
@@ -632,8 +632,13 @@ static int load_module(struct module *mod, struct codetag *start, struct codetag
mod->name);
return -ENOMEM;
}
- }
+ /*
+ * Avoid a kmemleak false positive. The pointer to the counters is stored
+ * in the alloc_tag section of the module and cannot be directly accessed.
+ */
+ kmemleak_ignore_percpu(tag->counters);
+ }
return 0;
}
@@ -681,11 +686,10 @@ static int __init alloc_mod_tags_mem(void)
static void __init free_mod_tags_mem(void)
{
- int i;
+ release_pages_arg arg = { .pages = vm_module_tags->pages };
module_tags.start_addr = 0;
- for (i = 0; i < vm_module_tags->nr_pages; i++)
- __free_page(vm_module_tags->pages[i]);
+ release_pages(arg, vm_module_tags->nr_pages);
kfree(vm_module_tags->pages);
free_vm_area(vm_module_tags);
}
@@ -725,7 +729,7 @@ static int __init setup_early_mem_profiling(char *str)
}
mem_profiling_support = true;
pr_info("Memory allocation profiling is enabled %s compression and is turned %s!\n",
- compressed ? "with" : "without", enable ? "on" : "off");
+ compressed ? "with" : "without", str_on_off(enable));
}
if (enable != mem_alloc_profiling_enabled()) {
@@ -765,6 +769,20 @@ struct page_ext_operations page_alloc_tagging_ops = {
EXPORT_SYMBOL(page_alloc_tagging_ops);
#ifdef CONFIG_SYSCTL
+/*
+ * Not using proc_do_static_key() directly to prevent enabling profiling
+ * after it was shut down.
+ */
+static int proc_mem_profiling_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (!mem_profiling_support && write)
+ return -EINVAL;
+
+ return proc_do_static_key(table, write, buffer, lenp, ppos);
+}
+
static struct ctl_table memory_allocation_profiling_sysctls[] = {
{
.procname = "mem_profiling",
@@ -774,7 +792,7 @@ static struct ctl_table memory_allocation_profiling_sysctls[] = {
#else
.mode = 0644,
#endif
- .proc_handler = proc_do_static_key,
+ .proc_handler = proc_mem_profiling_handler,
},
};
@@ -811,7 +829,8 @@ static int __init alloc_tag_init(void)
return 0;
}
- if (!proc_create_seq(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_seq_op)) {
+ if (!proc_create_seq_private(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_seq_op,
+ sizeof(struct allocinfo_private), NULL)) {
pr_err("Failed to create %s file\n", ALLOCINFO_FILE_NAME);
shutdown_mem_profiling(false);
return -ENOMEM;
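
The switch to proc_create_seq_private() above moves ownership of the per-open iterator state into the seq_file core: the core allocates state_size bytes per open and hands them to ->start() via m->private, so ->stop() no longer frees anything. A minimal sketch of the pattern (everything except the proc and seq_file API names is illustrative):

	struct my_iter_state { int pos; };	/* hypothetical per-open state */

	static const struct seq_operations my_seq_ops = {
		/* .start/.next/.stop/.show cast m->private to my_iter_state */
	};

	static int __init my_proc_init(void)
	{
		if (!proc_create_seq_private("myfile", 0400, NULL, &my_seq_ops,
					     sizeof(struct my_iter_state), NULL))
			return -ENOMEM;
		return 0;
	}
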
diff --git a/lib/bitmap.c b/lib/bitmap.c
index b97692854966..9dc526507875 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -355,6 +355,12 @@ unsigned int __bitmap_weight_andnot(const unsigned long *bitmap1,
}
EXPORT_SYMBOL(__bitmap_weight_andnot);
+unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1,
+ const unsigned long *bitmap2, unsigned int bits)
+{
+ return BITMAP_WEIGHT(({dst[idx] = bitmap1[idx] | bitmap2[idx]; dst[idx]; }), bits);
+}
+
void __bitmap_set(unsigned long *map, unsigned int start, int len)
{
unsigned long *p = map + BIT_WORD(start);
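
The new __bitmap_weighted_or() fuses the OR and the population count into one pass over the words. A hedged usage sketch, using only the signature visible above (the surrounding names are illustrative):

	DECLARE_BITMAP(dst, 128);
	DECLARE_BITMAP(a, 128);
	DECLARE_BITMAP(b, 128);

	/* dst = a | b, and nbits counts the set bits of the result without
	 * a separate bitmap_weight() walk over dst. */
	unsigned int nbits = __bitmap_weighted_or(dst, a, b, 128);
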
diff --git a/lib/btree.c b/lib/btree.c
index bb81d3393ac5..9c80c0c7bba8 100644
--- a/lib/btree.c
+++ b/lib/btree.c
@@ -653,9 +653,9 @@ int btree_merge(struct btree_head *target, struct btree_head *victim,
* walks to remove a single object from the victim.
*/
for (;;) {
- if (!btree_last(victim, geo, key))
+ val = btree_last(victim, geo, key);
+ if (!val)
break;
- val = btree_lookup(victim, geo, key);
err = btree_insert(target, geo, key, val, gfp);
if (err)
return err;
diff --git a/lib/bug.c b/lib/bug.c
index b1f07459c2ee..edd9041f89f3 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -139,6 +139,29 @@ void bug_get_file_line(struct bug_entry *bug, const char **file,
#endif
}
+static const char *bug_get_format(struct bug_entry *bug)
+{
+ const char *format = NULL;
+#ifdef HAVE_ARCH_BUG_FORMAT
+#ifdef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
+ /*
+ * Allow an architecture to:
+ * - relative encode NULL (difficult vs KASLR);
+ * - use a literal 0 (there are no valid objects inside
+ * the __bug_table itself to refer to after all);
+ * - use an empty string.
+ */
+ if (bug->format_disp)
+ format = (const char *)&bug->format_disp + bug->format_disp;
+ if (format && format[0] == '\0')
+ format = NULL;
+#else
+ format = bug->format;
+#endif
+#endif
+ return format;
+}
+
struct bug_entry *find_bug(unsigned long bugaddr)
{
struct bug_entry *bug;
@@ -150,26 +173,51 @@ struct bug_entry *find_bug(unsigned long bugaddr)
return module_find_bug(bugaddr);
}
-static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *regs)
+static void __warn_printf(const char *fmt, struct pt_regs *regs)
{
- struct bug_entry *bug;
- const char *file;
- unsigned line, warning, once, done;
+ if (!fmt)
+ return;
+
+#ifdef HAVE_ARCH_BUG_FORMAT_ARGS
+ if (regs) {
+ struct arch_va_list _args;
+ va_list *args = __warn_args(&_args, regs);
+
+ if (args) {
+ vprintk(fmt, *args);
+ return;
+ }
+ }
+#endif
+
+ printk("%s", fmt);
+}
- if (!is_valid_bugaddr(bugaddr))
- return BUG_TRAP_TYPE_NONE;
+static enum bug_trap_type __report_bug(struct bug_entry *bug, unsigned long bugaddr, struct pt_regs *regs)
+{
+ bool warning, once, done, no_cut, has_args;
+ const char *file, *fmt;
+ unsigned line;
+
+ if (!bug) {
+ if (!is_valid_bugaddr(bugaddr))
+ return BUG_TRAP_TYPE_NONE;
- bug = find_bug(bugaddr);
- if (!bug)
- return BUG_TRAP_TYPE_NONE;
+ bug = find_bug(bugaddr);
+ if (!bug)
+ return BUG_TRAP_TYPE_NONE;
+ }
disable_trace_on_warning();
bug_get_file_line(bug, &file, &line);
+ fmt = bug_get_format(bug);
- warning = (bug->flags & BUGFLAG_WARNING) != 0;
- once = (bug->flags & BUGFLAG_ONCE) != 0;
- done = (bug->flags & BUGFLAG_DONE) != 0;
+ warning = bug->flags & BUGFLAG_WARNING;
+ once = bug->flags & BUGFLAG_ONCE;
+ done = bug->flags & BUGFLAG_DONE;
+ no_cut = bug->flags & BUGFLAG_NO_CUT_HERE;
+ has_args = bug->flags & BUGFLAG_ARGS;
if (warning && once) {
if (done)
@@ -187,8 +235,10 @@ static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *re
* "cut here" line now. WARN() issues its own "cut here" before the
* extra debugging message it writes before triggering the handler.
*/
- if ((bug->flags & BUGFLAG_NO_CUT_HERE) == 0)
+ if (!no_cut) {
printk(KERN_DEFAULT CUT_HERE);
+ __warn_printf(fmt, has_args ? regs : NULL);
+ }
if (warning) {
/* this is a WARN_ON rather than BUG/BUG_ON */
@@ -206,13 +256,25 @@ static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *re
return BUG_TRAP_TYPE_BUG;
}
+enum bug_trap_type report_bug_entry(struct bug_entry *bug, struct pt_regs *regs)
+{
+ enum bug_trap_type ret;
+ bool rcu = false;
+
+ rcu = warn_rcu_enter();
+ ret = __report_bug(bug, 0, regs);
+ warn_rcu_exit(rcu);
+
+ return ret;
+}
+
enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
{
enum bug_trap_type ret;
bool rcu = false;
rcu = warn_rcu_enter();
- ret = __report_bug(bugaddr, regs);
+ ret = __report_bug(NULL, bugaddr, regs);
warn_rcu_exit(rcu);
return ret;
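
The new report_bug_entry() serves callers that already hold the bug_entry and therefore want to skip the is_valid_bugaddr()/find_bug() lookup. A hedged sketch of how an architecture trap handler might use it; the handler shape is an assumption, not taken from any real arch:

	/* Hypothetical arch hook: the arch decoded the trapping instruction
	 * and resolved the bug_entry itself, so it passes it in directly. */
	static int handle_bug_trap(struct pt_regs *regs, struct bug_entry *bug)
	{
		if (report_bug_entry(bug, regs) == BUG_TRAP_TYPE_WARN)
			return 1;	/* warning: skip the insn and continue */
		return 0;		/* genuine BUG(): let the caller oops */
	}
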
diff --git a/lib/clz_ctz.c b/lib/clz_ctz.c
index fb8c0c5c2bd2..8778ec44bf63 100644
--- a/lib/clz_ctz.c
+++ b/lib/clz_ctz.c
@@ -15,28 +15,28 @@
#include <linux/kernel.h>
int __weak __ctzsi2(int val);
-int __weak __ctzsi2(int val)
+int __weak __attribute_const__ __ctzsi2(int val)
{
return __ffs(val);
}
EXPORT_SYMBOL(__ctzsi2);
int __weak __clzsi2(int val);
-int __weak __clzsi2(int val)
+int __weak __attribute_const__ __clzsi2(int val)
{
return 32 - fls(val);
}
EXPORT_SYMBOL(__clzsi2);
int __weak __clzdi2(u64 val);
-int __weak __clzdi2(u64 val)
+int __weak __attribute_const__ __clzdi2(u64 val)
{
return 64 - fls64(val);
}
EXPORT_SYMBOL(__clzdi2);
int __weak __ctzdi2(u64 val);
-int __weak __ctzdi2(u64 val)
+int __weak __attribute_const__ __ctzdi2(u64 val)
{
return __ffs64(val);
}
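
A quick worked check of the identities these fallbacks rely on: val = 0x00800000 has only bit 23 set, so fls(val) = 24 (the 1-based index of the highest set bit) and __clzsi2(val) = 32 - 24 = 8, matching the eight leading zero bits; likewise __ffs(val) = 23 is exactly the trailing-zero count that __ctzsi2() must return.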
diff --git a/lib/codetag.c b/lib/codetag.c
index 650d54d7e14d..545911cebd25 100644
--- a/lib/codetag.c
+++ b/lib/codetag.c
@@ -11,8 +11,14 @@ struct codetag_type {
struct list_head link;
unsigned int count;
struct idr mod_idr;
- struct rw_semaphore mod_lock; /* protects mod_idr */
+ /*
+ * protects mod_idr, next_mod_seq,
+ * iter->mod_seq and cmod->mod_seq
+ */
+ struct rw_semaphore mod_lock;
struct codetag_type_desc desc;
+ /* generates unique sequence number for module load */
+ unsigned long next_mod_seq;
};
struct codetag_range {
@@ -23,6 +29,7 @@ struct codetag_range {
struct codetag_module {
struct module *mod;
struct codetag_range range;
+ unsigned long mod_seq;
};
static DEFINE_MUTEX(codetag_lock);
@@ -48,6 +55,7 @@ struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype)
.cmod = NULL,
.mod_id = 0,
.ct = NULL,
+ .mod_seq = 0,
};
return iter;
@@ -91,11 +99,13 @@ struct codetag *codetag_next_ct(struct codetag_iterator *iter)
if (!cmod)
break;
- if (cmod != iter->cmod) {
+ if (!iter->cmod || iter->mod_seq != cmod->mod_seq) {
iter->cmod = cmod;
+ iter->mod_seq = cmod->mod_seq;
ct = get_first_module_ct(cmod);
- } else
+ } else {
ct = get_next_module_ct(iter);
+ }
if (ct)
break;
@@ -191,6 +201,7 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod)
cmod->range = range;
down_write(&cttype->mod_lock);
+ cmod->mod_seq = ++cttype->next_mod_seq;
mod_id = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
if (mod_id >= 0) {
if (cttype->desc.module_load) {
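
The mod_seq added above lets a long-lived iterator notice that the codetag_module occupying its idr slot was replaced by a module reload, restarting from that module's first tag instead of chasing a stale cmod pointer. The canonical walk, assembled only from functions visible in this diff (cttype is the codetag_type being walked; locking as used by the alloc_tag code):

	struct codetag_iterator iter;
	struct codetag *ct;

	codetag_lock_module_list(cttype, true);
	iter = codetag_get_ct_iter(cttype);
	/* a mod_seq mismatch inside codetag_next_ct() restarts iteration
	 * at the reloaded module's first tag */
	while ((ct = codetag_next_ct(&iter)))
		;	/* ... visit ct ... */
	codetag_lock_module_list(cttype, false);
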
diff --git a/lib/crc/.gitignore b/lib/crc/.gitignore
new file mode 100644
index 000000000000..a9e48103c9fb
--- /dev/null
+++ b/lib/crc/.gitignore
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/crc32table.h
+/crc64table.h
+/gen_crc32table
+/gen_crc64table
diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig
new file mode 100644
index 000000000000..70e7a6016de3
--- /dev/null
+++ b/lib/crc/Kconfig
@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# Kconfig for the kernel's cyclic redundancy check (CRC) library code
+
+config CRC4
+ tristate
+ help
+ The CRC4 library functions. Select this if your module uses any of
+ the functions from <linux/crc4.h>.
+
+config CRC7
+ tristate
+ help
+ The CRC7 library functions. Select this if your module uses any of
+ the functions from <linux/crc7.h>.
+
+config CRC8
+ tristate
+ help
+ The CRC8 library functions. Select this if your module uses any of
+ the functions from <linux/crc8.h>.
+
+config CRC16
+ tristate
+ help
+ The CRC16 library functions. Select this if your module uses any of
+ the functions from <linux/crc16.h>.
+
+config CRC_CCITT
+ tristate
+ help
+ The CRC-CCITT library functions. Select this if your module uses any
+ of the functions from <linux/crc-ccitt.h>.
+
+config CRC_ITU_T
+ tristate
+ help
+ The CRC-ITU-T library functions. Select this if your module uses
+ any of the functions from <linux/crc-itu-t.h>.
+
+config CRC_T10DIF
+ tristate
+ help
+ The CRC-T10DIF library functions. Select this if your module uses
+ any of the functions from <linux/crc-t10dif.h>.
+
+config CRC_T10DIF_ARCH
+ bool
+ depends on CRC_T10DIF && CRC_OPTIMIZATIONS
+ default y if ARM && KERNEL_MODE_NEON
+ default y if ARM64 && KERNEL_MODE_NEON
+ default y if PPC64 && ALTIVEC
+ default y if RISCV && RISCV_ISA_ZBC
+ default y if X86
+
+config CRC32
+ tristate
+ select BITREVERSE
+ help
+ The CRC32 library functions. Select this if your module uses any of
+ the functions from <linux/crc32.h> or <linux/crc32c.h>.
+
+config CRC32_ARCH
+ bool
+ depends on CRC32 && CRC_OPTIMIZATIONS
+ default y if ARM && KERNEL_MODE_NEON
+ default y if ARM64
+ default y if LOONGARCH
+ default y if MIPS && CPU_MIPSR6
+ default y if PPC64 && ALTIVEC
+ default y if RISCV && RISCV_ISA_ZBC
+ default y if S390
+ default y if SPARC64
+ default y if X86
+
+config CRC64
+ tristate
+ help
+ The CRC64 library functions. Select this if your module uses any of
+ the functions from <linux/crc64.h>.
+
+config CRC64_ARCH
+ bool
+ depends on CRC64 && CRC_OPTIMIZATIONS
+ default y if RISCV && RISCV_ISA_ZBC && 64BIT
+ default y if X86_64
+
+config CRC_OPTIMIZATIONS
+ bool "Enable optimized CRC implementations" if EXPERT
+ depends on !UML
+ default y
+ help
+ Disabling this option reduces code size slightly by disabling the
+ architecture-optimized implementations of any CRC variants that are
+ enabled. CRC checksumming performance may get much slower.
+
+ Keep this enabled unless you're really trying to minimize the size of
+ the kernel.
+
+config CRC_KUNIT_TEST
+ tristate "KUnit tests for CRC functions" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
+ select CRC7
+ select CRC16
+ select CRC_T10DIF
+ select CRC32
+ select CRC64
+ help
+ Unit tests for the CRC library functions.
+
+ This is intended to help people writing architecture-specific
+ optimized versions. If unsure, say N.
+
+config CRC_BENCHMARK
+ bool "Benchmark for the CRC functions"
+ depends on CRC_KUNIT_TEST
+ help
+ Include benchmarks in the KUnit test suite for the CRC functions.
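
As the help texts above say, a module that uses <linux/crc-t10dif.h> selects CRC_T10DIF and transparently gets the accelerated code whenever CRC_T10DIF_ARCH resolves to y. A minimal sketch (the caller is illustrative; crc_t10dif() is the real library function):

	#include <linux/crc-t10dif.h>

	/* Guard tag over one 512-byte sector: whether this runs the generic
	 * table code or the NEON/VPMSUM/PCLMUL path is a build-time decision
	 * made by CRC_T10DIF_ARCH, invisible to the caller. */
	static u16 sector_guard(const u8 *sector)
	{
		return crc_t10dif(sector, 512);
	}
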
diff --git a/lib/crc/Makefile b/lib/crc/Makefile
new file mode 100644
index 000000000000..7543ad295ab6
--- /dev/null
+++ b/lib/crc/Makefile
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# Makefile for the kernel's cyclic redundancy check (CRC) library code
+
+obj-$(CONFIG_CRC4) += crc4.o
+obj-$(CONFIG_CRC7) += crc7.o
+obj-$(CONFIG_CRC8) += crc8.o
+obj-$(CONFIG_CRC16) += crc16.o
+obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o
+obj-$(CONFIG_CRC_ITU_T) += crc-itu-t.o
+
+obj-$(CONFIG_CRC_T10DIF) += crc-t10dif.o
+crc-t10dif-y := crc-t10dif-main.o
+ifeq ($(CONFIG_CRC_T10DIF_ARCH),y)
+CFLAGS_crc-t10dif-main.o += -I$(src)/$(SRCARCH)
+crc-t10dif-$(CONFIG_ARM) += arm/crc-t10dif-core.o
+crc-t10dif-$(CONFIG_ARM64) += arm64/crc-t10dif-core.o
+crc-t10dif-$(CONFIG_PPC) += powerpc/crct10dif-vpmsum_asm.o
+crc-t10dif-$(CONFIG_RISCV) += riscv/crc16_msb.o
+crc-t10dif-$(CONFIG_X86) += x86/crc16-msb-pclmul.o
+endif
+
+obj-$(CONFIG_CRC32) += crc32.o
+crc32-y := crc32-main.o
+ifeq ($(CONFIG_CRC32_ARCH),y)
+CFLAGS_crc32-main.o += -I$(src)/$(SRCARCH)
+crc32-$(CONFIG_ARM) += arm/crc32-core.o
+crc32-$(CONFIG_ARM64) += arm64/crc32-core.o
+crc32-$(CONFIG_PPC) += powerpc/crc32c-vpmsum_asm.o
+crc32-$(CONFIG_RISCV) += riscv/crc32_lsb.o riscv/crc32_msb.o
+crc32-$(CONFIG_S390) += s390/crc32le-vx.o s390/crc32be-vx.o
+crc32-$(CONFIG_SPARC) += sparc/crc32c_asm.o
+crc32-$(CONFIG_X86) += x86/crc32-pclmul.o
+crc32-$(CONFIG_X86_64) += x86/crc32c-3way.o
+endif
+
+obj-$(CONFIG_CRC64) += crc64.o
+crc64-y := crc64-main.o
+ifeq ($(CONFIG_CRC64_ARCH),y)
+CFLAGS_crc64-main.o += -I$(src)/$(SRCARCH)
+crc64-$(CONFIG_RISCV) += riscv/crc64_lsb.o riscv/crc64_msb.o
+crc64-$(CONFIG_X86) += x86/crc64-pclmul.o
+endif
+
+obj-y += tests/
+
+hostprogs := gen_crc32table gen_crc64table
+clean-files := crc32table.h crc64table.h
+
+$(obj)/crc32-main.o: $(obj)/crc32table.h
+$(obj)/crc64-main.o: $(obj)/crc64table.h
+
+quiet_cmd_crc32 = GEN $@
+ cmd_crc32 = $< > $@
+
+quiet_cmd_crc64 = GEN $@
+ cmd_crc64 = $< > $@
+
+$(obj)/crc32table.h: $(obj)/gen_crc32table
+ $(call cmd,crc32)
+
+$(obj)/crc64table.h: $(obj)/gen_crc64table
+ $(call cmd,crc64)
diff --git a/lib/crc/arm/crc-t10dif-core.S b/lib/crc/arm/crc-t10dif-core.S
new file mode 100644
index 000000000000..2bbf2df9c1e2
--- /dev/null
+++ b/lib/crc/arm/crc-t10dif-core.S
@@ -0,0 +1,468 @@
+//
+// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
+//
+// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+// Copyright (C) 2019 Google LLC <ebiggers@google.com>
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License version 2 as
+// published by the Free Software Foundation.
+//
+
+// Derived from the x86 version:
+//
+// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+//
+// Copyright (c) 2013, Intel Corporation
+//
+// Authors:
+// Erdinc Ozturk <erdinc.ozturk@intel.com>
+// Vinodh Gopal <vinodh.gopal@intel.com>
+// James Guilford <james.guilford@intel.com>
+// Tim Chen <tim.c.chen@linux.intel.com>
+//
+// This software is available to you under a choice of one of two
+// licenses. You may choose to be licensed under the terms of the GNU
+// General Public License (GPL) Version 2, available from the file
+// COPYING in the main directory of this source tree, or the
+// OpenIB.org BSD license below:
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the
+// distribution.
+//
+// * Neither the name of the Intel Corporation nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+//
+// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Reference paper titled "Fast CRC Computation for Generic
+// Polynomials Using PCLMULQDQ Instruction"
+// URL: http://www.intel.com/content/dam/www/public/us/en/documents
+// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+//
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+#ifdef CONFIG_CPU_ENDIAN_BE8
+#define CPU_LE(code...)
+#else
+#define CPU_LE(code...) code
+#endif
+
+ .text
+ .arch armv8-a
+ .fpu crypto-neon-fp-armv8
+
+ init_crc .req r0
+ buf .req r1
+ len .req r2
+
+ fold_consts_ptr .req ip
+
+ q0l .req d0
+ q0h .req d1
+ q1l .req d2
+ q1h .req d3
+ q2l .req d4
+ q2h .req d5
+ q3l .req d6
+ q3h .req d7
+ q4l .req d8
+ q4h .req d9
+ q5l .req d10
+ q5h .req d11
+ q6l .req d12
+ q6h .req d13
+ q7l .req d14
+ q7h .req d15
+ q8l .req d16
+ q8h .req d17
+ q9l .req d18
+ q9h .req d19
+ q10l .req d20
+ q10h .req d21
+ q11l .req d22
+ q11h .req d23
+ q12l .req d24
+ q12h .req d25
+
+ FOLD_CONSTS .req q10
+ FOLD_CONST_L .req q10l
+ FOLD_CONST_H .req q10h
+
+ /*
+ * Pairwise long polynomial multiplication of two 16-bit values
+ *
+ * { w0, w1 }, { y0, y1 }
+ *
+ * by two 64-bit values
+ *
+ * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
+ *
+ * where each vector element is a byte, ordered from least to most
+ * significant. The resulting 80-bit vectors are XOR'ed together.
+ *
+ * This can be implemented using 8x8 long polynomial multiplication, by
+ * reorganizing the input so that each pairwise 8x8 multiplication
+ * produces one of the terms from the decomposition below, and
+ * combining the results of each rank and shifting them into place.
+ *
+ * Rank
+ * 0 w0*x0 ^ | y0*z0 ^
+ * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^
+ * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^
+ * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^
+ * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^
+ * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^
+ * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^
+ * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^
+ * 8 w1*x7 << 64 | y1*z7 << 64
+ *
+ * The inputs can be reorganized into
+ *
+ * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
+ * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
+ *
+ * and after performing 8x8->16 bit long polynomial multiplication of
+ * each of the halves of the first vector with those of the second one,
+ * we obtain the following four vectors of 16-bit elements:
+ *
+ * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
+ * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
+ * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
+ * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
+ *
+ * Results b and c can be XORed together, as the vector elements have
+ * matching ranks. Then, the final XOR can be pulled forward, and
+ * applied between the halves of each of the remaining three vectors,
+ * which are then shifted into place, and XORed together to produce the
+ * final 80-bit result.
+ */
+ .macro pmull16x64_p8, v16, v64
+ vext.8 q11, \v64, \v64, #1
+ vld1.64 {q12}, [r4, :128]
+ vuzp.8 q11, \v64
+ vtbl.8 d24, {\v16\()_L-\v16\()_H}, d24
+ vtbl.8 d25, {\v16\()_L-\v16\()_H}, d25
+ bl __pmull16x64_p8
+ veor \v64, q12, q14
+ .endm
+
+__pmull16x64_p8:
+ vmull.p8 q13, d23, d24
+ vmull.p8 q14, d23, d25
+ vmull.p8 q15, d22, d24
+ vmull.p8 q12, d22, d25
+
+ veor q14, q14, q15
+ veor d24, d24, d25
+ veor d26, d26, d27
+ veor d28, d28, d29
+ vmov.i32 d25, #0
+ vmov.i32 d29, #0
+ vext.8 q12, q12, q12, #14
+ vext.8 q14, q14, q14, #15
+ veor d24, d24, d26
+ bx lr
+ENDPROC(__pmull16x64_p8)
+
+ .macro pmull16x64_p64, v16, v64
+ vmull.p64 q11, \v64\()l, \v16\()_L
+ vmull.p64 \v64, \v64\()h, \v16\()_H
+ veor \v64, \v64, q11
+ .endm
+
+ // Fold reg1, reg2 into the next 32 data bytes, storing the result back
+ // into reg1, reg2.
+ .macro fold_32_bytes, reg1, reg2, p
+ vld1.64 {q8-q9}, [buf]!
+
+ pmull16x64_\p FOLD_CONST, \reg1
+ pmull16x64_\p FOLD_CONST, \reg2
+
+CPU_LE( vrev64.8 q8, q8 )
+CPU_LE( vrev64.8 q9, q9 )
+ vswp q8l, q8h
+ vswp q9l, q9h
+
+ veor.8 \reg1, \reg1, q8
+ veor.8 \reg2, \reg2, q9
+ .endm
+
+ // Fold src_reg into dst_reg, optionally loading the next fold constants
+ .macro fold_16_bytes, src_reg, dst_reg, p, load_next_consts
+ pmull16x64_\p FOLD_CONST, \src_reg
+ .ifnb \load_next_consts
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
+ .endif
+ veor.8 \dst_reg, \dst_reg, \src_reg
+ .endm
+
+ .macro crct10dif, p
+ // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+ cmp len, #256
+ blt .Lless_than_256_bytes\@
+
+ mov_l fold_consts_ptr, .Lfold_across_128_bytes_consts
+
+ // Load the first 128 data bytes. Byte swapping is necessary to make
+ // the bit order match the polynomial coefficient order.
+ vld1.64 {q0-q1}, [buf]!
+ vld1.64 {q2-q3}, [buf]!
+ vld1.64 {q4-q5}, [buf]!
+ vld1.64 {q6-q7}, [buf]!
+CPU_LE( vrev64.8 q0, q0 )
+CPU_LE( vrev64.8 q1, q1 )
+CPU_LE( vrev64.8 q2, q2 )
+CPU_LE( vrev64.8 q3, q3 )
+CPU_LE( vrev64.8 q4, q4 )
+CPU_LE( vrev64.8 q5, q5 )
+CPU_LE( vrev64.8 q6, q6 )
+CPU_LE( vrev64.8 q7, q7 )
+ vswp q0l, q0h
+ vswp q1l, q1h
+ vswp q2l, q2h
+ vswp q3l, q3h
+ vswp q4l, q4h
+ vswp q5l, q5h
+ vswp q6l, q6h
+ vswp q7l, q7h
+
+ // XOR the first 16 data *bits* with the initial CRC value.
+ vmov.i8 q8h, #0
+ vmov.u16 q8h[3], init_crc
+ veor q0h, q0h, q8h
+
+ // Load the constants for folding across 128 bytes.
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
+
+ // Subtract 128 for the 128 data bytes just consumed. Subtract another
+ // 128 to simplify the termination condition of the following loop.
+ sub len, len, #256
+
+ // While >= 128 data bytes remain (not counting q0-q7), fold the 128
+ // bytes q0-q7 into them, storing the result back into q0-q7.
+.Lfold_128_bytes_loop\@:
+ fold_32_bytes q0, q1, \p
+ fold_32_bytes q2, q3, \p
+ fold_32_bytes q4, q5, \p
+ fold_32_bytes q6, q7, \p
+ subs len, len, #128
+ bge .Lfold_128_bytes_loop\@
+
+ // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.
+
+ // Fold across 64 bytes.
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
+ fold_16_bytes q0, q4, \p
+ fold_16_bytes q1, q5, \p
+ fold_16_bytes q2, q6, \p
+ fold_16_bytes q3, q7, \p, 1
+ // Fold across 32 bytes.
+ fold_16_bytes q4, q6, \p
+ fold_16_bytes q5, q7, \p, 1
+ // Fold across 16 bytes.
+ fold_16_bytes q6, q7, \p
+
+ // Add 128 to get the correct number of data bytes remaining in 0...127
+ // (not counting q7), following the previous extra subtraction by 128.
+ // Then subtract 16 to simplify the termination condition of the
+ // following loop.
+ adds len, len, #(128-16)
+
+ // While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
+ // into them, storing the result back into q7.
+ blt .Lfold_16_bytes_loop_done\@
+.Lfold_16_bytes_loop\@:
+ pmull16x64_\p FOLD_CONST, q7
+ vld1.64 {q0}, [buf]!
+CPU_LE( vrev64.8 q0, q0 )
+ vswp q0l, q0h
+ veor.8 q7, q7, q0
+ subs len, len, #16
+ bge .Lfold_16_bytes_loop\@
+
+.Lfold_16_bytes_loop_done\@:
+ // Add 16 to get the correct number of data bytes remaining in 0...15
+ // (not counting q7), following the previous extra subtraction by 16.
+ adds len, len, #16
+ beq .Lreduce_final_16_bytes\@
+
+.Lhandle_partial_segment\@:
+ // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
+ // 16 bytes are in q7 and the rest are the remaining data in 'buf'. To
+ // do this without needing a fold constant for each possible 'len',
+ // redivide the bytes into a first chunk of 'len' bytes and a second
+ // chunk of 16 bytes, then fold the first chunk into the second.
+
+ // q0 = last 16 original data bytes
+ add buf, buf, len
+ sub buf, buf, #16
+ vld1.64 {q0}, [buf]
+CPU_LE( vrev64.8 q0, q0 )
+ vswp q0l, q0h
+
+ // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
+ mov_l r1, .Lbyteshift_table + 16
+ sub r1, r1, len
+ vld1.8 {q2}, [r1]
+ vtbl.8 q1l, {q7l-q7h}, q2l
+ vtbl.8 q1h, {q7l-q7h}, q2h
+
+ // q3 = first chunk: q7 right-shifted by '16-len' bytes.
+ vmov.i8 q3, #0x80
+ veor.8 q2, q2, q3
+ vtbl.8 q3l, {q7l-q7h}, q2l
+ vtbl.8 q3h, {q7l-q7h}, q2h
+
+ // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
+ vshr.s8 q2, q2, #7
+
+ // q2 = second chunk: 'len' bytes from q0 (low-order bytes),
+ // then '16-len' bytes from q1 (high-order bytes).
+ vbsl.8 q2, q1, q0
+
+ // Fold the first chunk into the second chunk, storing the result in q7.
+ pmull16x64_\p FOLD_CONST, q3
+ veor.8 q7, q3, q2
+ b .Lreduce_final_16_bytes\@
+
+.Lless_than_256_bytes\@:
+ // Checksumming a buffer of length 16...255 bytes
+
+ mov_l fold_consts_ptr, .Lfold_across_16_bytes_consts
+
+ // Load the first 16 data bytes.
+ vld1.64 {q7}, [buf]!
+CPU_LE( vrev64.8 q7, q7 )
+ vswp q7l, q7h
+
+ // XOR the first 16 data *bits* with the initial CRC value.
+ vmov.i8 q0h, #0
+ vmov.u16 q0h[3], init_crc
+ veor.8 q7h, q7h, q0h
+
+ // Load the fold-across-16-bytes constants.
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
+
+ cmp len, #16
+ beq .Lreduce_final_16_bytes\@ // len == 16
+ subs len, len, #32
+ addlt len, len, #16
+ blt .Lhandle_partial_segment\@ // 17 <= len <= 31
+ b .Lfold_16_bytes_loop\@ // 32 <= len <= 255
+
+.Lreduce_final_16_bytes\@:
+ .endm
+
+//
+// u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+ENTRY(crc_t10dif_pmull64)
+ crct10dif p64
+
+ // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
+
+ // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
+
+ // Fold the high 64 bits into the low 64 bits, while also multiplying by
+ // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
+ // whose low 48 bits are 0.
+ vmull.p64 q0, q7h, FOLD_CONST_H // high bits * x^48 * (x^80 mod G(x))
+ veor.8 q0h, q0h, q7l // + low bits * x^64
+
+ // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
+ // value congruent to x^64 * M(x) and whose low 48 bits are 0.
+ vmov.i8 q1, #0
+ vmov s4, s3 // extract high 32 bits
+ vmov s3, s5 // zero high 32 bits
+ vmull.p64 q1, q1l, FOLD_CONST_L // high 32 bits * x^48 * (x^48 mod G(x))
+ veor.8 q0, q0, q1 // + low bits
+
+ // Load G(x) and floor(x^48 / G(x)).
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]
+
+ // Use Barrett reduction to compute the final CRC value.
+ vmull.p64 q1, q0h, FOLD_CONST_H // high 32 bits * floor(x^48 / G(x))
+ vshr.u64 q1l, q1l, #32 // /= x^32
+ vmull.p64 q1, q1l, FOLD_CONST_L // *= G(x)
+ vshr.u64 q0l, q0l, #48
+ veor.8 q0l, q0l, q1l // + low 16 nonzero bits
+ // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of q0.
+
+ vmov.u16 r0, q0l[0]
+ bx lr
+ENDPROC(crc_t10dif_pmull64)
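As an editor's aside: the chain of folds plus the Barrett step above can be cross-checked against a plain bitwise model of CRC-T10DIF. This is a host-side sketch, not part of the patch; the polynomial 0x18bb7 is taken from the constant table at the end of this file (0x8bb7 once the implicit x^16 term is dropped):

	#include <stddef.h>
	#include <stdint.h>

	static uint16_t crc_t10dif_bitwise(uint16_t crc, const uint8_t *p,
					   size_t len)
	{
		while (len--) {
			crc ^= (uint16_t)*p++ << 8;	/* feed one byte, MSB first */
			for (int i = 0; i < 8; i++)
				crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7
						     : crc << 1;
		}
		return crc;
	}

For any len >= 16, this should agree with the value returned by crc_t10dif_pmull64().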
+
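+//
+// void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len,
+//			   u8 out[16]);
+//
+// Assumes len >= 16. Instead of returning the CRC, this variant folds the
+// input down to 16 bytes and stores them at 'out' (passed in r3); the caller
+// finishes the reduction with the generic table-based code.
+//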
+ENTRY(crc_t10dif_pmull8)
+ push {r4, lr}
+ mov_l r4, .L16x64perm
+
+ crct10dif p8
+
+CPU_LE( vrev64.8 q7, q7 )
+ vswp q7l, q7h
+ vst1.64 {q7}, [r3, :128]
+ pop {r4, pc}
+ENDPROC(crc_t10dif_pmull8)
+
+ .section ".rodata", "a"
+ .align 4
+
+// Fold constants precomputed from the polynomial 0x18bb7
+// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+ .quad 0x0000000000006123 // x^(8*128) mod G(x)
+ .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
+// .Lfold_across_64_bytes_consts:
+ .quad 0x0000000000001069 // x^(4*128) mod G(x)
+ .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
+// .Lfold_across_32_bytes_consts:
+ .quad 0x000000000000857d // x^(2*128) mod G(x)
+ .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
+.Lfold_across_16_bytes_consts:
+ .quad 0x000000000000a010 // x^(1*128) mod G(x)
+ .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
+// .Lfinal_fold_consts:
+ .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
+ .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
+// .Lbarrett_reduction_consts:
+ .quad 0x0000000000018bb7 // G(x)
+ .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
+
+// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
+// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
+// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
+ .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
+ .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0x0
+
+.L16x64perm:
+ .quad 0x808080800000000, 0x909090901010101
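How one 32-byte table yields both shift vectors is easier to see in C. A host-side sketch of the vtbl semantics that the partial-segment handling above relies on (editor's illustration; the helper name is made up):

	#include <stdint.h>

	/* vtbl/tbl semantics: an out-of-range index selects a zero byte */
	static void tbl16(uint8_t dst[16], const uint8_t src[16],
			  const uint8_t idx[16])
	{
		for (int i = 0; i < 16; i++)
			dst[i] = (idx[i] < 16) ? src[idx[i]] : 0;
	}

With idx pointing at &byteshift_table[16 - len], tbl16() produces src shifted left by 'len' bytes (the 0x8x entries select zeros). XOR'ing every index with 0x80 swaps which entries are valid, so the same table then produces src shifted right by '16 - len' bytes.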
diff --git a/lib/crc/arm/crc-t10dif.h b/lib/crc/arm/crc-t10dif.h
new file mode 100644
index 000000000000..afc0ebf97f19
--- /dev/null
+++ b/lib/crc/arm/crc-t10dif.h
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
+
+#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
+
+asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
+asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len,
+ u8 out[16]);
+
+static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
+{
+ if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && likely(may_use_simd())) {
+ if (static_branch_likely(&have_pmull)) {
+ scoped_ksimd()
+ return crc_t10dif_pmull64(crc, data, length);
+ } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE &&
+ static_branch_likely(&have_neon)) {
+ u8 buf[16] __aligned(16);
+
+ scoped_ksimd()
+ crc_t10dif_pmull8(crc, data, length, buf);
+
+ return crc_t10dif_generic(0, buf, sizeof(buf));
+ }
+ }
+ return crc_t10dif_generic(crc, data, length);
+}
+
+#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch
+static void crc_t10dif_mod_init_arch(void)
+{
+ if (elf_hwcap & HWCAP_NEON) {
+ static_branch_enable(&have_neon);
+ if (elf_hwcap2 & HWCAP2_PMULL)
+ static_branch_enable(&have_pmull);
+ }
+}
diff --git a/lib/crc/arm/crc32-core.S b/lib/crc/arm/crc32-core.S
new file mode 100644
index 000000000000..6f674f30c70b
--- /dev/null
+++ b/lib/crc/arm/crc32-core.S
@@ -0,0 +1,306 @@
+/*
+ * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Using the hardware-provided PCLMULQDQ instruction to accelerate the CRC32
+ * calculation.
+ * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
+ * PCLMULQDQ is an instruction introduced with Intel SSE4.2; the reference
+ * can be found at:
+ * https://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2B: Instruction Set Reference, N-Z
+ *
+ * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
+ * Alexander Boyko <Alexander_Boyko@xyratex.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+ .align 6
+ .arch armv8-a
+ .arch_extension crc
+ .fpu crypto-neon-fp-armv8
+
+.Lcrc32_constants:
+ /*
+ * [(x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
+ * #define CONSTANT_R1 0x154442bd4LL
+ *
+ * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
+ * #define CONSTANT_R2 0x1c6e41596LL
+ */
+ .quad 0x0000000154442bd4
+ .quad 0x00000001c6e41596
+
+ /*
+ * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
+ * #define CONSTANT_R3 0x1751997d0LL
+ *
+ * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
+ * #define CONSTANT_R4 0x0ccaa009eLL
+ */
+ .quad 0x00000001751997d0
+ .quad 0x00000000ccaa009e
+
+ /*
+ * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
+ * #define CONSTANT_R5 0x163cd6124LL
+ */
+ .quad 0x0000000163cd6124
+ .quad 0x00000000FFFFFFFF
+
+ /*
+ * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
+ *
+ * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
+ * = 0x1F7011641LL
+ * #define CONSTANT_RU 0x1F7011641LL
+ */
+ .quad 0x00000001DB710641
+ .quad 0x00000001F7011641
+
+.Lcrc32c_constants:
+ .quad 0x00000000740eef02
+ .quad 0x000000009e4addf8
+ .quad 0x00000000f20c0dfe
+ .quad 0x000000014cd00bd6
+ .quad 0x00000000dd45aab8
+ .quad 0x00000000FFFFFFFF
+ .quad 0x0000000105ec76f0
+ .quad 0x00000000dea713f1
+
+ dCONSTANTl .req d0
+ dCONSTANTh .req d1
+ qCONSTANT .req q0
+
+ BUF .req r0
+ LEN .req r1
+ CRC .req r2
+
+ qzr .req q9
+
+ /**
+ * Calculate crc32
+ * BUF - buffer
+ * LEN - size of buffer in bytes (multiple of 16), should be > 63
+ * CRC - initial crc32
+ * return the computed crc32 in r0
+ *
+ * u32 crc32_pmull_le(const u8 buf[], u32 len, u32 init_crc)
+ */
+SYM_FUNC_START(crc32_pmull_le)
+ adr r3, .Lcrc32_constants
+ b 0f
+SYM_FUNC_END(crc32_pmull_le)
+
+SYM_FUNC_START(crc32c_pmull_le)
+ adr r3, .Lcrc32c_constants
+
+0: bic LEN, LEN, #15
+ vld1.8 {q1-q2}, [BUF, :128]!
+ vld1.8 {q3-q4}, [BUF, :128]!
+ vmov.i8 qzr, #0
+ vmov.i8 qCONSTANT, #0
+ vmov.32 dCONSTANTl[0], CRC
+ veor.8 d2, d2, dCONSTANTl
+ sub LEN, LEN, #0x40
+ cmp LEN, #0x40
+ blt less_64
+
+ vld1.64 {qCONSTANT}, [r3]
+
+loop_64: /* fold a full 64-byte cache line */
+ sub LEN, LEN, #0x40
+
+ vmull.p64 q5, d3, dCONSTANTh
+ vmull.p64 q6, d5, dCONSTANTh
+ vmull.p64 q7, d7, dCONSTANTh
+ vmull.p64 q8, d9, dCONSTANTh
+
+ vmull.p64 q1, d2, dCONSTANTl
+ vmull.p64 q2, d4, dCONSTANTl
+ vmull.p64 q3, d6, dCONSTANTl
+ vmull.p64 q4, d8, dCONSTANTl
+
+ veor.8 q1, q1, q5
+ vld1.8 {q5}, [BUF, :128]!
+ veor.8 q2, q2, q6
+ vld1.8 {q6}, [BUF, :128]!
+ veor.8 q3, q3, q7
+ vld1.8 {q7}, [BUF, :128]!
+ veor.8 q4, q4, q8
+ vld1.8 {q8}, [BUF, :128]!
+
+ veor.8 q1, q1, q5
+ veor.8 q2, q2, q6
+ veor.8 q3, q3, q7
+ veor.8 q4, q4, q8
+
+ cmp LEN, #0x40
+ bge loop_64
+
+less_64: /* fold the cache line down into 128 bits */
+ vldr dCONSTANTl, [r3, #16]
+ vldr dCONSTANTh, [r3, #24]
+
+ vmull.p64 q5, d3, dCONSTANTh
+ vmull.p64 q1, d2, dCONSTANTl
+ veor.8 q1, q1, q5
+ veor.8 q1, q1, q2
+
+ vmull.p64 q5, d3, dCONSTANTh
+ vmull.p64 q1, d2, dCONSTANTl
+ veor.8 q1, q1, q5
+ veor.8 q1, q1, q3
+
+ vmull.p64 q5, d3, dCONSTANTh
+ vmull.p64 q1, d2, dCONSTANTl
+ veor.8 q1, q1, q5
+ veor.8 q1, q1, q4
+
+ teq LEN, #0
+ beq fold_64
+
+loop_16: /* fold the rest of the buffer into 128 bits */
+ subs LEN, LEN, #0x10
+
+ vld1.8 {q2}, [BUF, :128]!
+ vmull.p64 q5, d3, dCONSTANTh
+ vmull.p64 q1, d2, dCONSTANTl
+ veor.8 q1, q1, q5
+ veor.8 q1, q1, q2
+
+ bne loop_16
+
+fold_64:
+ /* perform the last 64-bit fold, which also adds 32 zeroes
+ * to the input stream */
+ vmull.p64 q2, d2, dCONSTANTh
+ vext.8 q1, q1, qzr, #8
+ veor.8 q1, q1, q2
+
+ /* final 32-bit fold */
+ vldr dCONSTANTl, [r3, #32]
+ vldr d6, [r3, #40]
+ vmov.i8 d7, #0
+
+ vext.8 q2, q1, qzr, #4
+ vand.8 d2, d2, d6
+ vmull.p64 q1, d2, dCONSTANTl
+ veor.8 q1, q1, q2
+
+ /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
+ vldr dCONSTANTl, [r3, #48]
+ vldr dCONSTANTh, [r3, #56]
+
+ vand.8 q2, q1, q3
+ vext.8 q2, qzr, q2, #8
+ vmull.p64 q2, d5, dCONSTANTh
+ vand.8 q2, q2, q3
+ vmull.p64 q2, d4, dCONSTANTl
+ veor.8 q1, q1, q2
+ vmov r0, s5
+
+ bx lr
+SYM_FUNC_END(crc32c_pmull_le)
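As with the T10DIF code, the folded result can be validated against a bitwise model. A sketch of the little-endian (bit-reflected) CRC32 both entry points compute, equivalent to the kernel's table-driven crc32_le_base(); for CRC32C, substitute the reflected polynomial 0x82f63b78:

	#include <stddef.h>
	#include <stdint.h>

	static uint32_t crc32_le_bitwise(uint32_t crc, const uint8_t *p,
					 size_t len)
	{
		while (len--) {
			crc ^= *p++;	/* reflected: LSB is the highest-order term */
			for (int i = 0; i < 8; i++)
				crc = (crc >> 1) ^ (0xedb88320 & -(crc & 1));
		}
		return crc;
	}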
+
+ .macro __crc32, c
+ subs ip, r2, #8
+ bmi .Ltail\c
+
+ tst r1, #3
+ bne .Lunaligned\c
+
+ teq ip, #0
+.Laligned8\c:
+ ldrd r2, r3, [r1], #8
+ARM_BE8(rev r2, r2 )
+ARM_BE8(rev r3, r3 )
+ crc32\c\()w r0, r0, r2
+ crc32\c\()w r0, r0, r3
+ bxeq lr
+ subs ip, ip, #8
+ bpl .Laligned8\c
+
+.Ltail\c:
+ tst ip, #4
+ beq 2f
+ ldr r3, [r1], #4
+ARM_BE8(rev r3, r3 )
+ crc32\c\()w r0, r0, r3
+
+2: tst ip, #2
+ beq 1f
+ ldrh r3, [r1], #2
+ARM_BE8(rev16 r3, r3 )
+ crc32\c\()h r0, r0, r3
+
+1: tst ip, #1
+ bxeq lr
+ ldrb r3, [r1]
+ crc32\c\()b r0, r0, r3
+ bx lr
+
+.Lunaligned\c:
+ tst r1, #1
+ beq 2f
+ ldrb r3, [r1], #1
+ subs r2, r2, #1
+ crc32\c\()b r0, r0, r3
+
+ tst r1, #2
+ beq 0f
+2: ldrh r3, [r1], #2
+ subs r2, r2, #2
+ARM_BE8(rev16 r3, r3 )
+ crc32\c\()h r0, r0, r3
+
+0: subs ip, r2, #8
+ bpl .Laligned8\c
+ b .Ltail\c
+ .endm
+
+ .align 5
+SYM_FUNC_START(crc32_armv8_le)
+ __crc32
+SYM_FUNC_END(crc32_armv8_le)
+
+ .align 5
+SYM_FUNC_START(crc32c_armv8_le)
+ __crc32 c
+SYM_FUNC_END(crc32c_armv8_le)
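The scalar tail logic above has a direct C analogue using the ACLE CRC32 intrinsics, which may make the bit-testing easier to follow. A little-endian-only sketch (the asm additionally byte-swaps on BE); assumes a compiler targeting ARMv8 with the CRC extension:

	#include <arm_acle.h>	/* __crc32w/__crc32h/__crc32b */
	#include <stdint.h>
	#include <string.h>

	static uint32_t crc32_tail(uint32_t crc, const uint8_t *p, size_t rem)
	{
		uint32_t w;
		uint16_t h;

		if (rem & 4) {			/* mirrors "tst ip, #4" */
			memcpy(&w, p, 4);
			crc = __crc32w(crc, w);
			p += 4;
		}
		if (rem & 2) {
			memcpy(&h, p, 2);
			crc = __crc32h(crc, h);
			p += 2;
		}
		if (rem & 1)
			crc = __crc32b(crc, *p);
		return crc;
	}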
diff --git a/lib/crc/arm/crc32.h b/lib/crc/arm/crc32.h
new file mode 100644
index 000000000000..f33de6b22cd4
--- /dev/null
+++ b/lib/crc/arm/crc32.h
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/cpufeature.h>
+
+#include <asm/hwcap.h>
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
+
+#define PMULL_MIN_LEN 64 /* min size of buffer for pmull functions */
+
+asmlinkage u32 crc32_pmull_le(const u8 buf[], u32 len, u32 init_crc);
+asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], u32 len);
+
+asmlinkage u32 crc32c_pmull_le(const u8 buf[], u32 len, u32 init_crc);
+asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], u32 len);
+
+static inline u32 crc32_le_scalar(u32 crc, const u8 *p, size_t len)
+{
+ if (static_branch_likely(&have_crc32))
+ return crc32_armv8_le(crc, p, len);
+ return crc32_le_base(crc, p, len);
+}
+
+static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (len >= PMULL_MIN_LEN + 15 &&
+ static_branch_likely(&have_pmull) && likely(may_use_simd())) {
+ size_t n = -(uintptr_t)p & 15;
+
+ /* align p to 16-byte boundary */
+ if (n) {
+ crc = crc32_le_scalar(crc, p, n);
+ p += n;
+ len -= n;
+ }
+ n = round_down(len, 16);
+ scoped_ksimd()
+ crc = crc32_pmull_le(p, n, crc);
+ p += n;
+ len -= n;
+ }
+ return crc32_le_scalar(crc, p, len);
+}
+
+static inline u32 crc32c_scalar(u32 crc, const u8 *p, size_t len)
+{
+ if (static_branch_likely(&have_crc32))
+ return crc32c_armv8_le(crc, p, len);
+ return crc32c_base(crc, p, len);
+}
+
+static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (len >= PMULL_MIN_LEN + 15 &&
+ static_branch_likely(&have_pmull) && likely(may_use_simd())) {
+ size_t n = -(uintptr_t)p & 15;
+
+ /* align p to 16-byte boundary */
+ if (n) {
+ crc = crc32c_scalar(crc, p, n);
+ p += n;
+ len -= n;
+ }
+ n = round_down(len, 16);
+ scoped_ksimd()
+ crc = crc32c_pmull_le(p, n, crc);
+ p += n;
+ len -= n;
+ }
+ return crc32c_scalar(crc, p, len);
+}
+
+#define crc32_be_arch crc32_be_base /* not implemented on this arch */
+
+#define crc32_mod_init_arch crc32_mod_init_arch
+static void crc32_mod_init_arch(void)
+{
+ if (elf_hwcap2 & HWCAP2_CRC32)
+ static_branch_enable(&have_crc32);
+ if (elf_hwcap2 & HWCAP2_PMULL)
+ static_branch_enable(&have_pmull);
+}
+
+static inline u32 crc32_optimizations_arch(void)
+{
+ if (elf_hwcap2 & (HWCAP2_CRC32 | HWCAP2_PMULL))
+ return CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION;
+ return 0;
+}
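A worked example of the head-alignment step shared by both helpers above (editor's note):

	/* n = -(uintptr_t)p & 15 is the byte count up to the next 16-byte
	 * boundary: for p = 0x1009, n = 7 and p + 7 = 0x1010; for an
	 * already-aligned p, n = 0 and the scalar head is skipped entirely. */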
diff --git a/lib/crc/arm64/crc-t10dif-core.S b/lib/crc/arm64/crc-t10dif-core.S
new file mode 100644
index 000000000000..87dd6d46224d
--- /dev/null
+++ b/lib/crc/arm64/crc-t10dif-core.S
@@ -0,0 +1,469 @@
+//
+// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
+//
+// Copyright (C) 2016 Linaro Ltd
+// Copyright (C) 2019-2024 Google LLC
+//
+// Authors: Ard Biesheuvel <ardb@google.com>
+// Eric Biggers <ebiggers@google.com>
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License version 2 as
+// published by the Free Software Foundation.
+//
+
+// Derived from the x86 version:
+//
+// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+//
+// Copyright (c) 2013, Intel Corporation
+//
+// Authors:
+// Erdinc Ozturk <erdinc.ozturk@intel.com>
+// Vinodh Gopal <vinodh.gopal@intel.com>
+// James Guilford <james.guilford@intel.com>
+// Tim Chen <tim.c.chen@linux.intel.com>
+//
+// This software is available to you under a choice of one of two
+// licenses. You may choose to be licensed under the terms of the GNU
+// General Public License (GPL) Version 2, available from the file
+// COPYING in the main directory of this source tree, or the
+// OpenIB.org BSD license below:
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the
+// distribution.
+//
+// * Neither the name of the Intel Corporation nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+//
+// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Reference paper titled "Fast CRC Computation for Generic
+// Polynomials Using PCLMULQDQ Instruction"
+// URL: http://www.intel.com/content/dam/www/public/us/en/documents
+// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+//
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+ .arch armv8-a+crypto
+
+ init_crc .req w0
+ buf .req x1
+ len .req x2
+ fold_consts_ptr .req x5
+
+ fold_consts .req v10
+
+ t3 .req v17
+ t4 .req v18
+ t5 .req v19
+ t6 .req v20
+ t7 .req v21
+ t8 .req v22
+
+ perm .req v27
+
+ .macro pmull16x64_p64, a16, b64, c64
+ pmull2 \c64\().1q, \a16\().2d, \b64\().2d
+ pmull \b64\().1q, \a16\().1d, \b64\().1d
+ .endm
+
+ /*
+ * Pairwise long polynomial multiplication of two 16-bit values
+ *
+ * { w0, w1 }, { y0, y1 }
+ *
+ * by two 64-bit values
+ *
+ * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
+ *
+ * where each vector element is a byte, ordered from least to most
+ * significant.
+ *
+ * This can be implemented using 8x8 long polynomial multiplication, by
+ * reorganizing the input so that each pairwise 8x8 multiplication
+ * produces one of the terms from the decomposition below, and
+ * combining the results of each rank and shifting them into place.
+ *
+ * Rank
+ * 0 w0*x0 ^ | y0*z0 ^
+ * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^
+ * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^
+ * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^
+ * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^
+ * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^
+ * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^
+ * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^
+ * 8 w1*x7 << 64 | y1*z7 << 64
+ *
+ * The inputs can be reorganized into
+ *
+ * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
+ * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
+ *
+ * and after performing 8x8->16 bit long polynomial multiplication of
+ * each of the halves of the first vector with those of the second one,
+ * we obtain the following four vectors of 16-bit elements:
+ *
+ * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
+ * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
+ * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
+ * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
+ *
+ * Results b and c can be XORed together, as the vector elements have
+ * matching ranks. Then, the final XOR (*) can be pulled forward, and
+ * applied between the halves of each of the remaining three vectors,
+ * which are then shifted into place, and combined to produce two
+ * 80-bit results.
+ *
+ * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
+ * to the 64x64 bit one above, but XOR'ing the outputs together will
+ * produce the expected result, and this is sufficient in the context of
+ * this algorithm.
+ */
+ .macro pmull16x64_p8, a16, b64, c64
+ ext t7.16b, \b64\().16b, \b64\().16b, #1
+ tbl t5.16b, {\a16\().16b}, perm.16b
+ uzp1 t7.16b, \b64\().16b, t7.16b
+ bl __pmull_p8_16x64
+ ext \b64\().16b, t4.16b, t4.16b, #15
+ eor \c64\().16b, t8.16b, t5.16b
+ .endm
+
+SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
+ ext t6.16b, t5.16b, t5.16b, #8
+
+ pmull t3.8h, t7.8b, t5.8b
+ pmull t4.8h, t7.8b, t6.8b
+ pmull2 t5.8h, t7.16b, t5.16b
+ pmull2 t6.8h, t7.16b, t6.16b
+
+ ext t8.16b, t3.16b, t3.16b, #8
+ eor t4.16b, t4.16b, t6.16b
+ ext t7.16b, t5.16b, t5.16b, #8
+ ext t6.16b, t4.16b, t4.16b, #8
+ eor t8.8b, t8.8b, t3.8b
+ eor t5.8b, t5.8b, t7.8b
+ eor t4.8b, t4.8b, t6.8b
+ ext t5.16b, t5.16b, t5.16b, #14
+ ret
+SYM_FUNC_END(__pmull_p8_16x64)
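The whole decomposition above is built from 8x8 -> 16-bit carryless multiplies; for reference, a model of that primitive, i.e. what pmull computes per byte lane (host-side sketch, not part of the patch):

	#include <stdint.h>

	static uint16_t clmul8(uint8_t a, uint8_t b)
	{
		uint16_t r = 0;

		for (int i = 0; i < 8; i++)
			if (b & (1 << i))
				r ^= (uint16_t)a << i;	/* XOR, not add: no carries */
		return r;
	}

Rank k in the table above is then (clmul8(w0, x_k) ^ clmul8(w1, x_{k-1})) << 8k, which is exactly what the four pmull/pmull2 products in __pmull_p8_16x64 assemble.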
+
+
+ // Fold reg1, reg2 into the next 32 data bytes, storing the result back
+ // into reg1, reg2.
+ .macro fold_32_bytes, p, reg1, reg2
+ ldp q11, q12, [buf], #0x20
+
+ pmull16x64_\p fold_consts, \reg1, v8
+
+CPU_LE( rev64 v11.16b, v11.16b )
+CPU_LE( rev64 v12.16b, v12.16b )
+
+ pmull16x64_\p fold_consts, \reg2, v9
+
+CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
+CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
+
+ eor \reg1\().16b, \reg1\().16b, v8.16b
+ eor \reg2\().16b, \reg2\().16b, v9.16b
+ eor \reg1\().16b, \reg1\().16b, v11.16b
+ eor \reg2\().16b, \reg2\().16b, v12.16b
+ .endm
+
+ // Fold src_reg into dst_reg, optionally loading the next fold constants
+ .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts
+ pmull16x64_\p fold_consts, \src_reg, v8
+ .ifnb \load_next_consts
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+ .endif
+ eor \dst_reg\().16b, \dst_reg\().16b, v8.16b
+ eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
+ .endm
+
+ .macro crc_t10dif_pmull, p
+
+ // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+ cmp len, #256
+ b.lt .Lless_than_256_bytes_\@
+
+ adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts
+
+ // Load the first 128 data bytes. Byte swapping is necessary to make
+ // the bit order match the polynomial coefficient order.
+ ldp q0, q1, [buf]
+ ldp q2, q3, [buf, #0x20]
+ ldp q4, q5, [buf, #0x40]
+ ldp q6, q7, [buf, #0x60]
+ add buf, buf, #0x80
+CPU_LE( rev64 v0.16b, v0.16b )
+CPU_LE( rev64 v1.16b, v1.16b )
+CPU_LE( rev64 v2.16b, v2.16b )
+CPU_LE( rev64 v3.16b, v3.16b )
+CPU_LE( rev64 v4.16b, v4.16b )
+CPU_LE( rev64 v5.16b, v5.16b )
+CPU_LE( rev64 v6.16b, v6.16b )
+CPU_LE( rev64 v7.16b, v7.16b )
+CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
+CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
+CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
+CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
+CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
+CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
+CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
+CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
+
+ // XOR the first 16 data *bits* with the initial CRC value.
+ movi v8.16b, #0
+ mov v8.h[7], init_crc
+ eor v0.16b, v0.16b, v8.16b
+
+ // Load the constants for folding across 128 bytes.
+ ld1 {fold_consts.2d}, [fold_consts_ptr]
+
+ // Subtract 128 for the 128 data bytes just consumed. Subtract another
+ // 128 to simplify the termination condition of the following loop.
+ sub len, len, #256
+
+ // While >= 128 data bytes remain (not counting v0-v7), fold the 128
+ // bytes v0-v7 into them, storing the result back into v0-v7.
+.Lfold_128_bytes_loop_\@:
+ fold_32_bytes \p, v0, v1
+ fold_32_bytes \p, v2, v3
+ fold_32_bytes \p, v4, v5
+ fold_32_bytes \p, v6, v7
+
+ subs len, len, #128
+ b.ge .Lfold_128_bytes_loop_\@
+
+ // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
+
+ // Fold across 64 bytes.
+ add fold_consts_ptr, fold_consts_ptr, #16
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+ fold_16_bytes \p, v0, v4
+ fold_16_bytes \p, v1, v5
+ fold_16_bytes \p, v2, v6
+ fold_16_bytes \p, v3, v7, 1
+ // Fold across 32 bytes.
+ fold_16_bytes \p, v4, v6
+ fold_16_bytes \p, v5, v7, 1
+ // Fold across 16 bytes.
+ fold_16_bytes \p, v6, v7
+
+ // Add 128 to get the correct number of data bytes remaining in 0...127
+ // (not counting v7), following the previous extra subtraction by 128.
+ // Then subtract 16 to simplify the termination condition of the
+ // following loop.
+ adds len, len, #(128-16)
+
+ // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
+ // into them, storing the result back into v7.
+ b.lt .Lfold_16_bytes_loop_done_\@
+.Lfold_16_bytes_loop_\@:
+ pmull16x64_\p fold_consts, v7, v8
+ eor v7.16b, v7.16b, v8.16b
+ ldr q0, [buf], #16
+CPU_LE( rev64 v0.16b, v0.16b )
+CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
+ eor v7.16b, v7.16b, v0.16b
+ subs len, len, #16
+ b.ge .Lfold_16_bytes_loop_\@
+
+.Lfold_16_bytes_loop_done_\@:
+ // Add 16 to get the correct number of data bytes remaining in 0...15
+ // (not counting v7), following the previous extra subtraction by 16.
+ adds len, len, #16
+ b.eq .Lreduce_final_16_bytes_\@
+
+.Lhandle_partial_segment_\@:
+ // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
+ // 16 bytes are in v7 and the rest are the remaining data in 'buf'. To
+ // do this without needing a fold constant for each possible 'len',
+ // redivide the bytes into a first chunk of 'len' bytes and a second
+ // chunk of 16 bytes, then fold the first chunk into the second.
+
+ // v0 = last 16 original data bytes
+ add buf, buf, len
+ ldr q0, [buf, #-16]
+CPU_LE( rev64 v0.16b, v0.16b )
+CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
+
+ // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
+ adr_l x4, .Lbyteshift_table + 16
+ sub x4, x4, len
+ ld1 {v2.16b}, [x4]
+ tbl v1.16b, {v7.16b}, v2.16b
+
+ // v3 = first chunk: v7 right-shifted by '16-len' bytes.
+ movi v3.16b, #0x80
+ eor v2.16b, v2.16b, v3.16b
+ tbl v3.16b, {v7.16b}, v2.16b
+
+ // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
+ sshr v2.16b, v2.16b, #7
+
+ // v2 = second chunk: 'len' bytes from v0 (low-order bytes),
+ // then '16-len' bytes from v1 (high-order bytes).
+ bsl v2.16b, v1.16b, v0.16b
+
+ // Fold the first chunk into the second chunk, storing the result in v7.
+ pmull16x64_\p fold_consts, v3, v0
+ eor v7.16b, v3.16b, v0.16b
+ eor v7.16b, v7.16b, v2.16b
+ b .Lreduce_final_16_bytes_\@
+
+.Lless_than_256_bytes_\@:
+ // Checksumming a buffer of length 16...255 bytes
+
+ adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts
+
+ // Load the first 16 data bytes.
+ ldr q7, [buf], #0x10
+CPU_LE( rev64 v7.16b, v7.16b )
+CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
+
+ // XOR the first 16 data *bits* with the initial CRC value.
+ movi v0.16b, #0
+ mov v0.h[7], init_crc
+ eor v7.16b, v7.16b, v0.16b
+
+ // Load the fold-across-16-bytes constants.
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+
+ cmp len, #16
+ b.eq .Lreduce_final_16_bytes_\@ // len == 16
+ subs len, len, #32
+ b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255
+ add len, len, #16
+ b .Lhandle_partial_segment_\@ // 17 <= len <= 31
+
+.Lreduce_final_16_bytes_\@:
+ .endm
+
+//
+// void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
+//			     u8 out[16]);
+//
+// Assumes len >= 16.
+//
+SYM_FUNC_START(crc_t10dif_pmull_p8)
+ frame_push 1
+
+ // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
+ movi perm.4h, #8, lsl #8
+ orr perm.2s, #1, lsl #16
+ orr perm.2s, #1, lsl #24
+ zip1 perm.16b, perm.16b, perm.16b
+ zip1 perm.16b, perm.16b, perm.16b
+
+ crc_t10dif_pmull p8
+
+CPU_LE( rev64 v7.16b, v7.16b )
+CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
+ str q7, [x3]
+
+ frame_pop
+ ret
+SYM_FUNC_END(crc_t10dif_pmull_p8)
+
+ .align 5
+//
+// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+SYM_FUNC_START(crc_t10dif_pmull_p64)
+ crc_t10dif_pmull p64
+
+ // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
+
+ movi v2.16b, #0 // init zero register
+
+ // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+
+ // Fold the high 64 bits into the low 64 bits, while also multiplying by
+ // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
+ // whose low 48 bits are 0.
+ ext v0.16b, v2.16b, v7.16b, #8
+ pmull2 v7.1q, v7.2d, fold_consts.2d // high bits * x^48 * (x^80 mod G(x))
+ eor v0.16b, v0.16b, v7.16b // + low bits * x^64
+
+ // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
+ // value congruent to x^64 * M(x) and whose low 48 bits are 0.
+ ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
+ mov v0.s[3], v2.s[0] // zero high 32 bits
+ pmull v1.1q, v1.1d, fold_consts.1d // high 32 bits * x^48 * (x^48 mod G(x))
+ eor v0.16b, v0.16b, v1.16b // + low bits
+
+ // Load G(x) and floor(x^48 / G(x)).
+ ld1 {fold_consts.2d}, [fold_consts_ptr]
+
+ // Use Barrett reduction to compute the final CRC value.
+ pmull2 v1.1q, v0.2d, fold_consts.2d // high 32 bits * floor(x^48 / G(x))
+ ushr v1.2d, v1.2d, #32 // /= x^32
+ pmull v1.1q, v1.1d, fold_consts.1d // *= G(x)
+ ushr v0.2d, v0.2d, #48
+ eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
+ // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+
+ umov w0, v0.h[0]
+ ret
+SYM_FUNC_END(crc_t10dif_pmull_p64)
+
+ .section ".rodata", "a"
+ .align 4
+
+// Fold constants precomputed from the polynomial 0x18bb7
+// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+ .quad 0x0000000000006123 // x^(8*128) mod G(x)
+ .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
+// .Lfold_across_64_bytes_consts:
+ .quad 0x0000000000001069 // x^(4*128) mod G(x)
+ .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
+// .Lfold_across_32_bytes_consts:
+ .quad 0x000000000000857d // x^(2*128) mod G(x)
+ .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
+.Lfold_across_16_bytes_consts:
+ .quad 0x000000000000a010 // x^(1*128) mod G(x)
+ .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
+// .Lfinal_fold_consts:
+ .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
+ .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
+// .Lbarrett_reduction_consts:
+ .quad 0x0000000000018bb7 // G(x)
+ .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
+
+// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
+// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
+// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
+ .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
+ .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0x0
diff --git a/lib/crc/arm64/crc-t10dif.h b/lib/crc/arm64/crc-t10dif.h
new file mode 100644
index 000000000000..b8338139ed77
--- /dev/null
+++ b/lib/crc/arm64/crc-t10dif.h
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/cpufeature.h>
+
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_asimd);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
+
+#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
+
+asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
+ u8 out[16]);
+asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
+
+static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
+{
+ if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && likely(may_use_simd())) {
+ if (static_branch_likely(&have_pmull)) {
+ scoped_ksimd()
+ return crc_t10dif_pmull_p64(crc, data, length);
+ } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE &&
+ static_branch_likely(&have_asimd)) {
+ u8 buf[16];
+
+ scoped_ksimd()
+ crc_t10dif_pmull_p8(crc, data, length, buf);
+
+ return crc_t10dif_generic(0, buf, sizeof(buf));
+ }
+ }
+ return crc_t10dif_generic(crc, data, length);
+}
+
+#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch
+static void crc_t10dif_mod_init_arch(void)
+{
+ if (cpu_have_named_feature(ASIMD)) {
+ static_branch_enable(&have_asimd);
+ if (cpu_have_named_feature(PMULL))
+ static_branch_enable(&have_pmull);
+ }
+}
diff --git a/lib/crc/arm64/crc32-core.S b/lib/crc/arm64/crc32-core.S
new file mode 100644
index 000000000000..68825317460f
--- /dev/null
+++ b/lib/crc/arm64/crc32-core.S
@@ -0,0 +1,362 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Accelerated CRC32(C) using AArch64 CRC and PMULL instructions
+ *
+ * Copyright (C) 2016 - 2018 Linaro Ltd.
+ * Copyright (C) 2024 Google LLC
+ *
+ * Author: Ard Biesheuvel <ardb@kernel.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .cpu generic+crc+crypto
+
+ .macro bitle, reg
+ .endm
+
+ .macro bitbe, reg
+ rbit \reg, \reg
+ .endm
+
+ .macro bytele, reg
+ .endm
+
+ .macro bytebe, reg
+ rbit \reg, \reg
+ lsr \reg, \reg, #24
+ .endm
+
+ .macro hwordle, reg
+CPU_BE( rev16 \reg, \reg )
+ .endm
+
+ .macro hwordbe, reg
+CPU_LE( rev \reg, \reg )
+ rbit \reg, \reg
+CPU_BE( lsr \reg, \reg, #16 )
+ .endm
+
+ .macro le, regs:vararg
+ .irp r, \regs
+CPU_BE( rev \r, \r )
+ .endr
+ .endm
+
+ .macro be, regs:vararg
+ .irp r, \regs
+CPU_LE( rev \r, \r )
+ .endr
+ .irp r, \regs
+ rbit \r, \r
+ .endr
+ .endm
+
+ .macro __crc32, c, order=le
+ bit\order w0
+ cmp x2, #16
+ b.lt 8f // less than 16 bytes
+
+ and x7, x2, #0x1f
+ and x2, x2, #~0x1f
+ cbz x7, 32f // multiple of 32 bytes
+
+ and x8, x7, #0xf
+ ldp x3, x4, [x1]
+ add x8, x8, x1
+ add x1, x1, x7
+ ldp x5, x6, [x8]
+ \order x3, x4, x5, x6
+
+ tst x7, #8
+ crc32\c\()x w8, w0, x3
+ csel x3, x3, x4, eq
+ csel w0, w0, w8, eq
+ tst x7, #4
+ lsr x4, x3, #32
+ crc32\c\()w w8, w0, w3
+ csel x3, x3, x4, eq
+ csel w0, w0, w8, eq
+ tst x7, #2
+ lsr w4, w3, #16
+ crc32\c\()h w8, w0, w3
+ csel w3, w3, w4, eq
+ csel w0, w0, w8, eq
+ tst x7, #1
+ crc32\c\()b w8, w0, w3
+ csel w0, w0, w8, eq
+ tst x7, #16
+ crc32\c\()x w8, w0, x5
+ crc32\c\()x w8, w8, x6
+ csel w0, w0, w8, eq
+ cbz x2, 0f
+
+32: ldp x3, x4, [x1], #32
+ sub x2, x2, #32
+ ldp x5, x6, [x1, #-16]
+ \order x3, x4, x5, x6
+ crc32\c\()x w0, w0, x3
+ crc32\c\()x w0, w0, x4
+ crc32\c\()x w0, w0, x5
+ crc32\c\()x w0, w0, x6
+ cbnz x2, 32b
+0: bit\order w0
+ ret
+
+8: tbz x2, #3, 4f
+ ldr x3, [x1], #8
+ \order x3
+ crc32\c\()x w0, w0, x3
+4: tbz x2, #2, 2f
+ ldr w3, [x1], #4
+ \order w3
+ crc32\c\()w w0, w0, w3
+2: tbz x2, #1, 1f
+ ldrh w3, [x1], #2
+ hword\order w3
+ crc32\c\()h w0, w0, w3
+1: tbz x2, #0, 0f
+ ldrb w3, [x1]
+ byte\order w3
+ crc32\c\()b w0, w0, w3
+0: bit\order w0
+ ret
+ .endm
+
+ .align 5
+SYM_FUNC_START(crc32_le_arm64)
+ __crc32
+SYM_FUNC_END(crc32_le_arm64)
+
+ .align 5
+SYM_FUNC_START(crc32c_le_arm64)
+ __crc32 c
+SYM_FUNC_END(crc32c_le_arm64)
+
+ .align 5
+SYM_FUNC_START(crc32_be_arm64)
+ __crc32 order=be
+SYM_FUNC_END(crc32_be_arm64)
+
+ in .req x1
+ len .req x2
+
+ /*
+ * w0: input CRC at entry, output CRC at exit
+ * x1: pointer to input buffer
+ * x2: length of input in bytes
+ */
+ .macro crc4way, insn, table, order=le
+ bit\order w0
+ lsr len, len, #6 // len := # of 64-byte blocks
+
+ /* Process up to 64 blocks of 64 bytes at a time */
+.La\@: mov x3, #64
+ cmp len, #64
+ csel x3, x3, len, hi // x3 := min(len, 64)
+ sub len, len, x3
+
+ /* Divide the input into 4 contiguous blocks */
+ add x4, x3, x3, lsl #1 // x4 := 3 * x3
+ add x7, in, x3, lsl #4 // x7 := in + 16 * x3
+ add x8, in, x3, lsl #5 // x8 := in + 32 * x3
+ add x9, in, x4, lsl #4 // x9 := in + 16 * x4
+
+ /* Load the folding coefficients from the lookup table */
+ adr_l x5, \table - 12 // entry 0 omitted
+ add x5, x5, x4, lsl #2 // x5 += 12 * x3
+ ldp s0, s1, [x5]
+ ldr s2, [x5, #8]
+
+ /* Zero init partial CRCs for this iteration */
+ mov w4, wzr
+ mov w5, wzr
+ mov w6, wzr
+ mov x17, xzr
+
+.Lb\@: sub x3, x3, #1
+ \insn w6, w6, x17
+ ldp x10, x11, [in], #16
+ ldp x12, x13, [x7], #16
+ ldp x14, x15, [x8], #16
+ ldp x16, x17, [x9], #16
+
+ \order x10, x11, x12, x13, x14, x15, x16, x17
+
+ /* Apply the CRC transform to 4 16-byte blocks in parallel */
+ \insn w0, w0, x10
+ \insn w4, w4, x12
+ \insn w5, w5, x14
+ \insn w6, w6, x16
+ \insn w0, w0, x11
+ \insn w4, w4, x13
+ \insn w5, w5, x15
+ cbnz x3, .Lb\@
+
+ /* Combine the 4 partial results into w0 */
+ mov v3.d[0], x0
+ mov v4.d[0], x4
+ mov v5.d[0], x5
+ pmull v0.1q, v0.1d, v3.1d
+ pmull v1.1q, v1.1d, v4.1d
+ pmull v2.1q, v2.1d, v5.1d
+ eor v0.8b, v0.8b, v1.8b
+ eor v0.8b, v0.8b, v2.8b
+ mov x5, v0.d[0]
+ eor x5, x5, x17
+ \insn w0, w6, x5
+
+ mov in, x9
+ cbnz len, .La\@
+
+ bit\order w0
+ ret
+ .endm
+
+ .align 5
+SYM_FUNC_START(crc32c_le_arm64_4way)
+ crc4way crc32cx, .L0
+SYM_FUNC_END(crc32c_le_arm64_4way)
+
+ .align 5
+SYM_FUNC_START(crc32_le_arm64_4way)
+ crc4way crc32x, .L1
+SYM_FUNC_END(crc32_le_arm64_4way)
+
+ .align 5
+SYM_FUNC_START(crc32_be_arm64_4way)
+ crc4way crc32x, .L1, be
+SYM_FUNC_END(crc32_be_arm64_4way)
+
+ .section .rodata, "a", %progbits
+ .align 6
+.L0: .long 0xddc0152b, 0xba4fc28e, 0x493c7d27
+ .long 0x0715ce53, 0x9e4addf8, 0xba4fc28e
+ .long 0xc96cfdc0, 0x0715ce53, 0xddc0152b
+ .long 0xab7aff2a, 0x0d3b6092, 0x9e4addf8
+ .long 0x299847d5, 0x878a92a7, 0x39d3b296
+ .long 0xb6dd949b, 0xab7aff2a, 0x0715ce53
+ .long 0xa60ce07b, 0x83348832, 0x47db8317
+ .long 0xd270f1a2, 0xb9e02b86, 0x0d3b6092
+ .long 0x65863b64, 0xb6dd949b, 0xc96cfdc0
+ .long 0xb3e32c28, 0xbac2fd7b, 0x878a92a7
+ .long 0xf285651c, 0xce7f39f4, 0xdaece73e
+ .long 0x271d9844, 0xd270f1a2, 0xab7aff2a
+ .long 0x6cb08e5c, 0x2b3cac5d, 0x2162d385
+ .long 0xcec3662e, 0x1b03397f, 0x83348832
+ .long 0x8227bb8a, 0xb3e32c28, 0x299847d5
+ .long 0xd7a4825c, 0xdd7e3b0c, 0xb9e02b86
+ .long 0xf6076544, 0x10746f3c, 0x18b33a4e
+ .long 0x98d8d9cb, 0x271d9844, 0xb6dd949b
+ .long 0x57a3d037, 0x93a5f730, 0x78d9ccb7
+ .long 0x3771e98f, 0x6b749fb2, 0xbac2fd7b
+ .long 0xe0ac139e, 0xcec3662e, 0xa60ce07b
+ .long 0x6f345e45, 0xe6fc4e6a, 0xce7f39f4
+ .long 0xa2b73df1, 0xb0cd4768, 0x61d82e56
+ .long 0x86d8e4d2, 0xd7a4825c, 0xd270f1a2
+ .long 0xa90fd27a, 0x0167d312, 0xc619809d
+ .long 0xca6ef3ac, 0x26f6a60a, 0x2b3cac5d
+ .long 0x4597456a, 0x98d8d9cb, 0x65863b64
+ .long 0xc9c8b782, 0x68bce87a, 0x1b03397f
+ .long 0x62ec6c6d, 0x6956fc3b, 0xebb883bd
+ .long 0x2342001e, 0x3771e98f, 0xb3e32c28
+ .long 0xe8b6368b, 0x2178513a, 0x064f7f26
+ .long 0x9ef68d35, 0x170076fa, 0xdd7e3b0c
+ .long 0x0b0bf8ca, 0x6f345e45, 0xf285651c
+ .long 0x02ee03b2, 0xff0dba97, 0x10746f3c
+ .long 0x135c83fd, 0xf872e54c, 0xc7a68855
+ .long 0x00bcf5f6, 0x86d8e4d2, 0x271d9844
+ .long 0x58ca5f00, 0x5bb8f1bc, 0x8e766a0c
+ .long 0xded288f8, 0xb3af077a, 0x93a5f730
+ .long 0x37170390, 0xca6ef3ac, 0x6cb08e5c
+ .long 0xf48642e9, 0xdd66cbbb, 0x6b749fb2
+ .long 0xb25b29f2, 0xe9e28eb4, 0x1393e203
+ .long 0x45cddf4e, 0xc9c8b782, 0xcec3662e
+ .long 0xdfd94fb2, 0x93e106a4, 0x96c515bb
+ .long 0x021ac5ef, 0xd813b325, 0xe6fc4e6a
+ .long 0x8e1450f7, 0x2342001e, 0x8227bb8a
+ .long 0xe0cdcf86, 0x6d9a4957, 0xb0cd4768
+ .long 0x613eee91, 0xd2c3ed1a, 0x39c7ff35
+ .long 0xbedc6ba1, 0x9ef68d35, 0xd7a4825c
+ .long 0x0cd1526a, 0xf2271e60, 0x0ab3844b
+ .long 0xd6c3a807, 0x2664fd8b, 0x0167d312
+ .long 0x1d31175f, 0x02ee03b2, 0xf6076544
+ .long 0x4be7fd90, 0x363bd6b3, 0x26f6a60a
+ .long 0x6eeed1c9, 0x5fabe670, 0xa741c1bf
+ .long 0xb3a6da94, 0x00bcf5f6, 0x98d8d9cb
+ .long 0x2e7d11a7, 0x17f27698, 0x49c3cc9c
+ .long 0x889774e1, 0xaa7c7ad5, 0x68bce87a
+ .long 0x8a074012, 0xded288f8, 0x57a3d037
+ .long 0xbd0bb25f, 0x6d390dec, 0x6956fc3b
+ .long 0x3be3c09b, 0x6353c1cc, 0x42d98888
+ .long 0x465a4eee, 0xf48642e9, 0x3771e98f
+ .long 0x2e5f3c8c, 0xdd35bc8d, 0xb42ae3d9
+ .long 0xa52f58ec, 0x9a5ede41, 0x2178513a
+ .long 0x47972100, 0x45cddf4e, 0xe0ac139e
+ .long 0x359674f7, 0xa51b6135, 0x170076fa
+
+.L1: .long 0xaf449247, 0x81256527, 0xccaa009e
+ .long 0x57c54819, 0x1d9513d7, 0x81256527
+ .long 0x3f41287a, 0x57c54819, 0xaf449247
+ .long 0xf5e48c85, 0x910eeec1, 0x1d9513d7
+ .long 0x1f0c2cdd, 0x9026d5b1, 0xae0b5394
+ .long 0x71d54a59, 0xf5e48c85, 0x57c54819
+ .long 0x1c63267b, 0xfe807bbd, 0x0cbec0ed
+ .long 0xd31343ea, 0xe95c1271, 0x910eeec1
+ .long 0xf9d9c7ee, 0x71d54a59, 0x3f41287a
+ .long 0x9ee62949, 0xcec97417, 0x9026d5b1
+ .long 0xa55d1514, 0xf183c71b, 0xd1df2327
+ .long 0x21aa2b26, 0xd31343ea, 0xf5e48c85
+ .long 0x9d842b80, 0xeea395c4, 0x3c656ced
+ .long 0xd8110ff1, 0xcd669a40, 0xfe807bbd
+ .long 0x3f9e9356, 0x9ee62949, 0x1f0c2cdd
+ .long 0x1d6708a0, 0x0c30f51d, 0xe95c1271
+ .long 0xef82aa68, 0xdb3935ea, 0xb918a347
+ .long 0xd14bcc9b, 0x21aa2b26, 0x71d54a59
+ .long 0x99cce860, 0x356d209f, 0xff6f2fc2
+ .long 0xd8af8e46, 0xc352f6de, 0xcec97417
+ .long 0xf1996890, 0xd8110ff1, 0x1c63267b
+ .long 0x631bc508, 0xe95c7216, 0xf183c71b
+ .long 0x8511c306, 0x8e031a19, 0x9b9bdbd0
+ .long 0xdb3839f3, 0x1d6708a0, 0xd31343ea
+ .long 0x7a92fffb, 0xf7003835, 0x4470ac44
+ .long 0x6ce68f2a, 0x00eba0c8, 0xeea395c4
+ .long 0x4caaa263, 0xd14bcc9b, 0xf9d9c7ee
+ .long 0xb46f7cff, 0x9a1b53c8, 0xcd669a40
+ .long 0x60290934, 0x81b6f443, 0x6d40f445
+ .long 0x8e976a7d, 0xd8af8e46, 0x9ee62949
+ .long 0xdcf5088a, 0x9dbdc100, 0x145575d5
+ .long 0x1753ab84, 0xbbf2f6d6, 0x0c30f51d
+ .long 0x255b139e, 0x631bc508, 0xa55d1514
+ .long 0xd784eaa8, 0xce26786c, 0xdb3935ea
+ .long 0x6d2c864a, 0x8068c345, 0x2586d334
+ .long 0x02072e24, 0xdb3839f3, 0x21aa2b26
+ .long 0x06689b0a, 0x5efd72f5, 0xe0575528
+ .long 0x1e52f5ea, 0x4117915b, 0x356d209f
+ .long 0x1d3d1db6, 0x6ce68f2a, 0x9d842b80
+ .long 0x3796455c, 0xb8e0e4a8, 0xc352f6de
+ .long 0xdf3a4eb3, 0xc55a2330, 0xb84ffa9c
+ .long 0x28ae0976, 0xb46f7cff, 0xd8110ff1
+ .long 0x9764bc8d, 0xd7e7a22c, 0x712510f0
+ .long 0x13a13e18, 0x3e9a43cd, 0xe95c7216
+ .long 0xb8ee242e, 0x8e976a7d, 0x3f9e9356
+ .long 0x0c540e7b, 0x753c81ff, 0x8e031a19
+ .long 0x9924c781, 0xb9220208, 0x3edcde65
+ .long 0x3954de39, 0x1753ab84, 0x1d6708a0
+ .long 0xf32238b5, 0xbec81497, 0x9e70b943
+ .long 0xbbd2cd2c, 0x0925d861, 0xf7003835
+ .long 0xcc401304, 0xd784eaa8, 0xef82aa68
+ .long 0x4987e684, 0x6044fbb0, 0x00eba0c8
+ .long 0x3aa11427, 0x18fe3b4a, 0x87441142
+ .long 0x297aad60, 0x02072e24, 0xd14bcc9b
+ .long 0xf60c5e51, 0x6ef6f487, 0x5b7fdd0a
+ .long 0x632d78c5, 0x3fc33de4, 0x9a1b53c8
+ .long 0x25b8822a, 0x1e52f5ea, 0x99cce860
+ .long 0xd4fc84bc, 0x1af62fb8, 0x81b6f443
+ .long 0x5690aa32, 0xa91fdefb, 0x688a110e
+ .long 0x1357a093, 0x3796455c, 0xd8af8e46
+ .long 0x798fdd33, 0xaaa18a37, 0x357b9517
+ .long 0xc2815395, 0x54d42691, 0x9dbdc100
+ .long 0x21cfc0f7, 0x28ae0976, 0xf1996890
+ .long 0xa0decef3, 0x7b4aa8b7, 0xbbf2f6d6
diff --git a/lib/crc/arm64/crc32.h b/lib/crc/arm64/crc32.h
new file mode 100644
index 000000000000..1939a5dee477
--- /dev/null
+++ b/lib/crc/arm64/crc32.h
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <asm/alternative.h>
+#include <asm/cpufeature.h>
+#include <asm/simd.h>
+
+// The minimum input length to consider using the 4-way interleaved code path
+static const size_t min_len = 1024;
+
+asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len);
+asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len);
+asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len);
+
+asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
+asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
+asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len);
+
+static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
+ return crc32_le_base(crc, p, len);
+
+ if (len >= min_len && cpu_have_named_feature(PMULL) &&
+ likely(may_use_simd())) {
+ scoped_ksimd()
+ crc = crc32_le_arm64_4way(crc, p, len);
+
+ p += round_down(len, 64);
+ len %= 64;
+
+ if (!len)
+ return crc;
+ }
+
+ return crc32_le_arm64(crc, p, len);
+}
+
+static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
+ return crc32c_base(crc, p, len);
+
+ if (len >= min_len && cpu_have_named_feature(PMULL) &&
+ likely(may_use_simd())) {
+ scoped_ksimd()
+ crc = crc32c_le_arm64_4way(crc, p, len);
+
+ p += round_down(len, 64);
+ len %= 64;
+
+ if (!len)
+ return crc;
+ }
+
+ return crc32c_le_arm64(crc, p, len);
+}
+
+static inline u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
+ return crc32_be_base(crc, p, len);
+
+ if (len >= min_len && cpu_have_named_feature(PMULL) &&
+ likely(may_use_simd())) {
+ scoped_ksimd()
+ crc = crc32_be_arm64_4way(crc, p, len);
+
+ p += round_down(len, 64);
+ len %= 64;
+
+ if (!len)
+ return crc;
+ }
+
+ return crc32_be_arm64(crc, p, len);
+}
+
+static inline u32 crc32_optimizations_arch(void)
+{
+ if (alternative_has_cap_likely(ARM64_HAS_CRC32))
+ return CRC32_LE_OPTIMIZATION |
+ CRC32_BE_OPTIMIZATION |
+ CRC32C_OPTIMIZATION;
+ return 0;
+}
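A quick worked example of the length handling shared by the three dispatchers above (editor's note):

	/* len = 1100: the 4-way code consumes round_down(1100, 64) = 1088
	 * bytes (17 blocks of 64); the remaining len % 64 = 12 bytes fall
	 * through to the scalar crc32_*_arm64() routine. */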
diff --git a/lib/crc-ccitt.c b/lib/crc/crc-ccitt.c
index 9cddf35d3b66..f8692c3de101 100644
--- a/lib/crc-ccitt.c
+++ b/lib/crc/crc-ccitt.c
@@ -1,11 +1,9 @@
// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/lib/crc-ccitt.c
- */
-#include <linux/types.h>
-#include <linux/module.h>
#include <linux/crc-ccitt.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/types.h>
/*
* This mysterious table is just the CRC of each possible byte. It can be
diff --git a/lib/crc-itu-t.c b/lib/crc/crc-itu-t.c
index 1d26a1647da5..6e413a290f54 100644
--- a/lib/crc-itu-t.c
+++ b/lib/crc/crc-itu-t.c
@@ -3,9 +3,10 @@
* crc-itu-t.c
*/
-#include <linux/types.h>
-#include <linux/module.h>
#include <linux/crc-itu-t.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/types.h>
/* CRC table for the CRC ITU-T V.41 0x1021 (x^16 + x^12 + x^5 + 1) */
const u16 crc_itu_t_table[256] = {
diff --git a/lib/crc-t10dif.c b/lib/crc/crc-t10dif-main.c
index 311c2ab829f1..08dde238e89f 100644
--- a/lib/crc-t10dif.c
+++ b/lib/crc/crc-t10dif-main.c
@@ -6,9 +6,10 @@
* Written by Martin K. Petersen <martin.petersen@oracle.com>
*/
-#include <linux/types.h>
-#include <linux/module.h>
#include <linux/crc-t10dif.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/types.h>
/*
* Table generated using the following polynomial:
@@ -50,16 +51,39 @@ static const u16 t10_dif_crc_table[256] = {
0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3
};
-u16 crc_t10dif_generic(u16 crc, const u8 *p, size_t len)
+static inline u16 __maybe_unused
+crc_t10dif_generic(u16 crc, const u8 *p, size_t len)
{
- size_t i;
+ while (len--)
+ crc = (crc << 8) ^ t10_dif_crc_table[(crc >> 8) ^ *p++];
+ return crc;
+}
- for (i = 0; i < len; i++)
- crc = (crc << 8) ^ t10_dif_crc_table[(crc >> 8) ^ p[i]];
+#ifdef CONFIG_CRC_T10DIF_ARCH
+#include "crc-t10dif.h" /* $(SRCARCH)/crc-t10dif.h */
+#else
+#define crc_t10dif_arch crc_t10dif_generic
+#endif
- return crc;
+u16 crc_t10dif_update(u16 crc, const u8 *p, size_t len)
+{
+ return crc_t10dif_arch(crc, p, len);
+}
+EXPORT_SYMBOL(crc_t10dif_update);
+
+#ifdef crc_t10dif_mod_init_arch
+static int __init crc_t10dif_mod_init(void)
+{
+ crc_t10dif_mod_init_arch();
+ return 0;
+}
+subsys_initcall(crc_t10dif_mod_init);
+
+static void __exit crc_t10dif_mod_exit(void)
+{
}
-EXPORT_SYMBOL(crc_t10dif_generic);
+module_exit(crc_t10dif_mod_exit);
+#endif
-MODULE_DESCRIPTION("T10 DIF CRC calculation");
+MODULE_DESCRIPTION("CRC-T10DIF library functions");
MODULE_LICENSE("GPL");
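Two properties of this CRC are worth illustrating: each table entry is just the bitwise CRC of a single byte fed in at the top of the state, and the update is incremental, so split buffers can be chained. An editor's sketch, not part of the patch:

	/* regenerate one entry: matches t10_dif_crc_table[i] */
	static u16 t10_dif_table_entry(u8 i)
	{
		u16 crc = (u16)i << 8;

		for (int b = 0; b < 8; b++)
			crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7
					     : crc << 1;
		return crc;
	}

	/* chaining: equals crc_t10dif_update(0, p, n + m) */
	u16 split = crc_t10dif_update(crc_t10dif_update(0, p, n), p + n, m);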
diff --git a/lib/crc16.c b/lib/crc/crc16.c
index 9c71eda9bf4b..931660a8cbaa 100644
--- a/lib/crc16.c
+++ b/lib/crc/crc16.c
@@ -3,9 +3,10 @@
* crc16.c
*/
-#include <linux/types.h>
-#include <linux/module.h>
#include <linux/crc16.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/types.h>
/** CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1) */
static const u16 crc16_table[256] = {
diff --git a/lib/crc/crc32-main.c b/lib/crc/crc32-main.c
new file mode 100644
index 000000000000..fbb90c9006e5
--- /dev/null
+++ b/lib/crc/crc32-main.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
+ * cleaned up code to current version of sparse and added the slicing-by-8
+ * algorithm to the closely similar existing slicing-by-4 algorithm.
+ *
+ * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com>
+ * Nicer crc32 functions/docs submitted by linux@horizon.com. Thanks!
+ * Code was from the public domain, copyright abandoned. Code was
+ * subsequently included in the kernel, thus was re-licensed under the
+ * GNU GPL v2.
+ *
+ * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com>
+ * Same crc32 function was used in 5 other places in the kernel.
+ * I made one version, and deleted the others.
+ * There are various incantations of crc32(). Some use a seed of 0 or ~0.
+ * Some xor at the end with ~0. The generic crc32() function takes
+ * seed as an argument, and doesn't xor at the end. Then individual
+ * users can do whatever they need.
+ * drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0.
+ * fs/jffs2 uses seed 0, doesn't xor with ~0.
+ * fs/partitions/efi.c uses seed ~0, xor's with ~0.
+ */
+
+/* see: Documentation/staging/crc32.rst for a description of algorithms */
+
+#include <linux/crc32.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include "crc32table.h"
+
+static inline u32 __maybe_unused
+crc32_le_base(u32 crc, const u8 *p, size_t len)
+{
+ while (len--)
+ crc = (crc >> 8) ^ crc32table_le[(crc & 255) ^ *p++];
+ return crc;
+}
+
+static inline u32 __maybe_unused
+crc32_be_base(u32 crc, const u8 *p, size_t len)
+{
+ while (len--)
+ crc = (crc << 8) ^ crc32table_be[(crc >> 24) ^ *p++];
+ return crc;
+}
+
+static inline u32 __maybe_unused
+crc32c_base(u32 crc, const u8 *p, size_t len)
+{
+ while (len--)
+ crc = (crc >> 8) ^ crc32ctable_le[(crc & 255) ^ *p++];
+ return crc;
+}
+
+#ifdef CONFIG_CRC32_ARCH
+#include "crc32.h" /* $(SRCARCH)/crc32.h */
+
+u32 crc32_optimizations(void)
+{
+ return crc32_optimizations_arch();
+}
+EXPORT_SYMBOL(crc32_optimizations);
+#else
+#define crc32_le_arch crc32_le_base
+#define crc32_be_arch crc32_be_base
+#define crc32c_arch crc32c_base
+#endif
+
+u32 crc32_le(u32 crc, const void *p, size_t len)
+{
+ return crc32_le_arch(crc, p, len);
+}
+EXPORT_SYMBOL(crc32_le);
+
+u32 crc32_be(u32 crc, const void *p, size_t len)
+{
+ return crc32_be_arch(crc, p, len);
+}
+EXPORT_SYMBOL(crc32_be);
+
+u32 crc32c(u32 crc, const void *p, size_t len)
+{
+ return crc32c_arch(crc, p, len);
+}
+EXPORT_SYMBOL(crc32c);
+
+#ifdef crc32_mod_init_arch
+static int __init crc32_mod_init(void)
+{
+ crc32_mod_init_arch();
+ return 0;
+}
+subsys_initcall(crc32_mod_init);
+
+static void __exit crc32_mod_exit(void)
+{
+}
+module_exit(crc32_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("CRC32 library functions");
+MODULE_LICENSE("GPL");
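As the header comment notes, seed and final-XOR conventions are left to callers. For example, the common IEEE 802.3-style checksum (as used by Ethernet and zlib) would be obtained with (illustrative only):

	u32 ieee_crc32(const void *p, size_t len)
	{
		return ~crc32_le(~0, p, len);	/* seed ~0, final XOR with ~0 */
	}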
diff --git a/lib/crc4.c b/lib/crc/crc4.c
index e7e1779c67d9..8e83fbe60bdc 100644
--- a/lib/crc4.c
+++ b/lib/crc/crc4.c
@@ -4,6 +4,7 @@
*/
#include <linux/crc4.h>
+#include <linux/export.h>
#include <linux/module.h>
static const uint8_t crc4_tab[] = {
diff --git a/lib/crc64.c b/lib/crc/crc64-main.c
index 5b1b17057f0a..1337036010fe 100644
--- a/lib/crc64.c
+++ b/lib/crc/crc64-main.c
@@ -33,26 +33,61 @@
* Author: Coly Li <colyli@suse.de>
*/
+#include <linux/crc64.h>
+#include <linux/export.h>
#include <linux/module.h>
#include <linux/types.h>
-#include <linux/crc64.h>
-#include "crc64table.h"
-MODULE_DESCRIPTION("CRC64 calculations");
-MODULE_LICENSE("GPL v2");
+#include "crc64table.h"
-u64 crc64_be_generic(u64 crc, const u8 *p, size_t len)
+static inline u64 __maybe_unused
+crc64_be_generic(u64 crc, const u8 *p, size_t len)
{
while (len--)
crc = (crc << 8) ^ crc64table[(crc >> 56) ^ *p++];
return crc;
}
-EXPORT_SYMBOL_GPL(crc64_be_generic);
-u64 crc64_nvme_generic(u64 crc, const u8 *p, size_t len)
+static inline u64 __maybe_unused
+crc64_nvme_generic(u64 crc, const u8 *p, size_t len)
{
while (len--)
crc = (crc >> 8) ^ crc64nvmetable[(crc & 0xff) ^ *p++];
return crc;
}
-EXPORT_SYMBOL_GPL(crc64_nvme_generic);
+
+#ifdef CONFIG_CRC64_ARCH
+#include "crc64.h" /* $(SRCARCH)/crc64.h */
+#else
+#define crc64_be_arch crc64_be_generic
+#define crc64_nvme_arch crc64_nvme_generic
+#endif
+
+u64 crc64_be(u64 crc, const void *p, size_t len)
+{
+ return crc64_be_arch(crc, p, len);
+}
+EXPORT_SYMBOL_GPL(crc64_be);
+
+u64 crc64_nvme(u64 crc, const void *p, size_t len)
+{
+ return ~crc64_nvme_arch(~crc, p, len);
+}
+EXPORT_SYMBOL_GPL(crc64_nvme);
+
+#ifdef crc64_mod_init_arch
+static int __init crc64_mod_init(void)
+{
+ crc64_mod_init_arch();
+ return 0;
+}
+subsys_initcall(crc64_mod_init);
+
+static void __exit crc64_mod_exit(void)
+{
+}
+module_exit(crc64_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("CRC64 library functions");
+MODULE_LICENSE("GPL");
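Note the asymmetry between the two wrappers: crc64_be() passes the seed straight through, while crc64_nvme() applies the NVMe pre- and post-inversion internally. Because the two inversions cancel between consecutive calls, both still chain with a plain 0 seed (illustrative sketch):

	u64 crc;

	crc = crc64_nvme(0, buf, len);					/* one shot */
	crc = crc64_nvme(crc64_nvme(0, buf, n), buf + n, len - n);	/* chained */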
diff --git a/lib/crc7.c b/lib/crc/crc7.c
index 8dd991cc6114..46b95d7ac6ce 100644
--- a/lib/crc7.c
+++ b/lib/crc/crc7.c
@@ -3,9 +3,10 @@
* crc7.c
*/
-#include <linux/types.h>
-#include <linux/module.h>
#include <linux/crc7.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/types.h>
/*
* Table for CRC-7 (polynomial x^7 + x^3 + 1).
diff --git a/lib/crc8.c b/lib/crc/crc8.c
index 1ad8e501d9b6..329c52158c45 100644
--- a/lib/crc8.c
+++ b/lib/crc/crc8.c
@@ -16,8 +16,9 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
#include <linux/crc8.h>
+#include <linux/export.h>
+#include <linux/module.h>
#include <linux/printk.h>
/**
diff --git a/lib/gen_crc32table.c b/lib/crc/gen_crc32table.c
index 6d03425b849e..9a7f31658e35 100644
--- a/lib/gen_crc32table.c
+++ b/lib/crc/gen_crc32table.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
-#include "../include/linux/crc32poly.h"
-#include "../include/generated/autoconf.h"
+#include "../../include/linux/crc32poly.h"
+#include "../../include/generated/autoconf.h"
#include <inttypes.h>
static uint32_t crc32table_le[256];
diff --git a/lib/gen_crc64table.c b/lib/crc/gen_crc64table.c
index e05a4230a0a0..f2be9f62bab7 100644
--- a/lib/gen_crc64table.c
+++ b/lib/crc/gen_crc64table.c
@@ -1,14 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Generate lookup table for the table-driven CRC64 calculation.
- *
- * gen_crc64table is executed in kernel build time and generates
- * lib/crc64table.h. This header is included by lib/crc64.c for
- * the table-driven CRC64 calculation.
- *
- * See lib/crc64.c for more information about which specification
- * and polynomial arithmetic that gen_crc64table.c follows to
- * generate the lookup table.
+ * This host program runs at kernel build time and generates the lookup tables
+ * used by the generic CRC64 code.
*
* Copyright 2018 SUSE Linux.
* Author: Coly Li <colyli@suse.de>
diff --git a/lib/crc/loongarch/crc32.h b/lib/crc/loongarch/crc32.h
new file mode 100644
index 000000000000..d34fa4c68632
--- /dev/null
+++ b/lib/crc/loongarch/crc32.h
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * CRC32 and CRC32C using LoongArch crc* instructions
+ *
+ * Module based on mips/crypto/crc32-mips.c
+ *
+ * Copyright (C) 2014 Linaro Ltd <yazen.ghannam@linaro.org>
+ * Copyright (C) 2018 MIPS Tech, LLC
+ * Copyright (C) 2020-2023 Loongson Technology Corporation Limited
+ */
+
+#include <asm/cpu-features.h>
+#include <linux/unaligned.h>
+
+#define _CRC32(crc, value, size, type) \
+do { \
+ __asm__ __volatile__( \
+ #type ".w." #size ".w" " %0, %1, %0\n\t"\
+ : "+r" (crc) \
+ : "r" (value) \
+ : "memory"); \
+} while (0)
+
+#define CRC32(crc, value, size) _CRC32(crc, value, size, crc)
+#define CRC32C(crc, value, size) _CRC32(crc, value, size, crcc)
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
+
+static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (!static_branch_likely(&have_crc32))
+ return crc32_le_base(crc, p, len);
+
+ while (len >= sizeof(u64)) {
+ u64 value = get_unaligned_le64(p);
+
+ CRC32(crc, value, d);
+ p += sizeof(u64);
+ len -= sizeof(u64);
+ }
+
+ if (len & sizeof(u32)) {
+ u32 value = get_unaligned_le32(p);
+
+ CRC32(crc, value, w);
+ p += sizeof(u32);
+ }
+
+ if (len & sizeof(u16)) {
+ u16 value = get_unaligned_le16(p);
+
+ CRC32(crc, value, h);
+ p += sizeof(u16);
+ }
+
+ if (len & sizeof(u8)) {
+ u8 value = *p++;
+
+ CRC32(crc, value, b);
+ }
+
+ return crc;
+}
+
+static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (!static_branch_likely(&have_crc32))
+ return crc32c_base(crc, p, len);
+
+ while (len >= sizeof(u64)) {
+ u64 value = get_unaligned_le64(p);
+
+ CRC32C(crc, value, d);
+ p += sizeof(u64);
+ len -= sizeof(u64);
+ }
+
+ if (len & sizeof(u32)) {
+ u32 value = get_unaligned_le32(p);
+
+ CRC32C(crc, value, w);
+ p += sizeof(u32);
+ }
+
+ if (len & sizeof(u16)) {
+ u16 value = get_unaligned_le16(p);
+
+ CRC32C(crc, value, h);
+ p += sizeof(u16);
+ }
+
+ if (len & sizeof(u8)) {
+ u8 value = *p++;
+
+ CRC32C(crc, value, b);
+ }
+
+ return crc;
+}
+
+#define crc32_be_arch crc32_be_base /* not implemented on this arch */
+
+#define crc32_mod_init_arch crc32_mod_init_arch
+static void crc32_mod_init_arch(void)
+{
+ if (cpu_has_crc32)
+ static_branch_enable(&have_crc32);
+}
+
+static inline u32 crc32_optimizations_arch(void)
+{
+ if (static_key_enabled(&have_crc32))
+ return CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION;
+ return 0;
+}
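The tail handling above relies on len being less than 8 once the u64 loop exits, so each of the "len & sizeof(u32)", "len & sizeof(u16)" and "len & sizeof(u8)" tests fires at most once and the remaining 0-7 bytes are consumed without a loop. A minimal user-space sketch of the same decomposition, with a hypothetical byte-at-a-time update standing in for the crc* instructions:

    #include <stdint.h>
    #include <stddef.h>

    /* Stand-in for a single-byte CRC32 update (reflected, poly
     * 0xEDB88320); illustrative only, not a kernel helper. */
    static uint32_t crc32_le_byte(uint32_t crc, uint8_t b)
    {
            int i;

            crc ^= b;
            for (i = 0; i < 8; i++)
                    crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320u : 0);
            return crc;
    }

    /* For len < 8, the bits of len select at most one 4-, one 2- and
     * one 1-byte step; no loop is needed. */
    static uint32_t crc32_tail(uint32_t crc, const uint8_t *p, size_t len)
    {
            int i;

            if (len & 4)
                    for (i = 0; i < 4; i++)
                            crc = crc32_le_byte(crc, *p++);
            if (len & 2) {
                    crc = crc32_le_byte(crc, *p++);
                    crc = crc32_le_byte(crc, *p++);
            }
            if (len & 1)
                    crc = crc32_le_byte(crc, *p);
            return crc;
    }

For example, len == 7 takes exactly one 4-byte, one 2-byte and one 1-byte step.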
diff --git a/lib/crc/mips/crc32.h b/lib/crc/mips/crc32.h
new file mode 100644
index 000000000000..3100354a049e
--- /dev/null
+++ b/lib/crc/mips/crc32.h
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * CRC32 and CRC32C using optional MIPSr6 instructions
+ *
+ * Module based on arm64/crypto/crc32-arm.c
+ *
+ * Copyright (C) 2014 Linaro Ltd <yazen.ghannam@linaro.org>
+ * Copyright (C) 2018 MIPS Tech, LLC
+ */
+
+#include <linux/cpufeature.h>
+#include <asm/mipsregs.h>
+#include <linux/unaligned.h>
+
+#ifndef TOOLCHAIN_SUPPORTS_CRC
+#define _ASM_SET_CRC(OP, SZ, TYPE) \
+_ASM_MACRO_3R(OP, rt, rs, rt2, \
+ ".ifnc \\rt, \\rt2\n\t" \
+ ".error \"invalid operands \\\"" #OP " \\rt,\\rs,\\rt2\\\"\"\n\t" \
+ ".endif\n\t" \
+ _ASM_INSN_IF_MIPS(0x7c00000f | (__rt << 16) | (__rs << 21) | \
+ ((SZ) << 6) | ((TYPE) << 8)) \
+ _ASM_INSN32_IF_MM(0x00000030 | (__rs << 16) | (__rt << 21) | \
+ ((SZ) << 14) | ((TYPE) << 3)))
+#define _ASM_UNSET_CRC(op, SZ, TYPE) ".purgem " #op "\n\t"
+#else /* !TOOLCHAIN_SUPPORTS_CRC */
+#define _ASM_SET_CRC(op, SZ, TYPE) ".set\tcrc\n\t"
+#define _ASM_UNSET_CRC(op, SZ, TYPE)
+#endif
+
+#define __CRC32(crc, value, op, SZ, TYPE) \
+do { \
+ __asm__ __volatile__( \
+ ".set push\n\t" \
+ _ASM_SET_CRC(op, SZ, TYPE) \
+ #op " %0, %1, %0\n\t" \
+ _ASM_UNSET_CRC(op, SZ, TYPE) \
+ ".set pop" \
+ : "+r" (crc) \
+ : "r" (value)); \
+} while (0)
+
+#define _CRC32_crc32b(crc, value) __CRC32(crc, value, crc32b, 0, 0)
+#define _CRC32_crc32h(crc, value) __CRC32(crc, value, crc32h, 1, 0)
+#define _CRC32_crc32w(crc, value) __CRC32(crc, value, crc32w, 2, 0)
+#define _CRC32_crc32d(crc, value) __CRC32(crc, value, crc32d, 3, 0)
+#define _CRC32_crc32cb(crc, value) __CRC32(crc, value, crc32cb, 0, 1)
+#define _CRC32_crc32ch(crc, value) __CRC32(crc, value, crc32ch, 1, 1)
+#define _CRC32_crc32cw(crc, value) __CRC32(crc, value, crc32cw, 2, 1)
+#define _CRC32_crc32cd(crc, value) __CRC32(crc, value, crc32cd, 3, 1)
+
+#define _CRC32(crc, value, size, op) \
+ _CRC32_##op##size(crc, value)
+
+#define CRC32(crc, value, size) \
+ _CRC32(crc, value, size, crc32)
+
+#define CRC32C(crc, value, size) \
+ _CRC32(crc, value, size, crc32c)
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
+
+static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (!static_branch_likely(&have_crc32))
+ return crc32_le_base(crc, p, len);
+
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ for (; len >= sizeof(u64); p += sizeof(u64), len -= sizeof(u64)) {
+ u64 value = get_unaligned_le64(p);
+
+ CRC32(crc, value, d);
+ }
+
+ if (len & sizeof(u32)) {
+ u32 value = get_unaligned_le32(p);
+
+ CRC32(crc, value, w);
+ p += sizeof(u32);
+ }
+ } else {
+ for (; len >= sizeof(u32); len -= sizeof(u32)) {
+ u32 value = get_unaligned_le32(p);
+
+ CRC32(crc, value, w);
+ p += sizeof(u32);
+ }
+ }
+
+ if (len & sizeof(u16)) {
+ u16 value = get_unaligned_le16(p);
+
+ CRC32(crc, value, h);
+ p += sizeof(u16);
+ }
+
+ if (len & sizeof(u8)) {
+ u8 value = *p++;
+
+ CRC32(crc, value, b);
+ }
+
+ return crc;
+}
+
+static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (!static_branch_likely(&have_crc32))
+ return crc32c_base(crc, p, len);
+
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ for (; len >= sizeof(u64); p += sizeof(u64), len -= sizeof(u64)) {
+ u64 value = get_unaligned_le64(p);
+
+ CRC32C(crc, value, d);
+ }
+
+ if (len & sizeof(u32)) {
+ u32 value = get_unaligned_le32(p);
+
+ CRC32C(crc, value, w);
+ p += sizeof(u32);
+ }
+ } else {
+ for (; len >= sizeof(u32); len -= sizeof(u32)) {
+ u32 value = get_unaligned_le32(p);
+
+ CRC32C(crc, value, w);
+ p += sizeof(u32);
+ }
+ }
+
+ if (len & sizeof(u16)) {
+ u16 value = get_unaligned_le16(p);
+
+ CRC32C(crc, value, h);
+ p += sizeof(u16);
+ }
+
+ if (len & sizeof(u8)) {
+ u8 value = *p++;
+
+ CRC32C(crc, value, b);
+	}
+
+	return crc;
+}
+
+#define crc32_be_arch crc32_be_base /* not implemented on this arch */
+
+#define crc32_mod_init_arch crc32_mod_init_arch
+static void crc32_mod_init_arch(void)
+{
+ if (cpu_have_feature(cpu_feature(MIPS_CRC32)))
+ static_branch_enable(&have_crc32);
+}
+
+static inline u32 crc32_optimizations_arch(void)
+{
+ if (static_key_enabled(&have_crc32))
+ return CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION;
+ return 0;
+}
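Both this header and the LoongArch one report CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION once the static key is enabled. A hedged sketch of how that report might be consumed (assuming a generic crc32_optimizations() query that returns the OR of the per-arch crc32_optimizations_arch() flags; the caller shown is hypothetical, not part of this series):

    /* Hypothetical consumer: gate a hardware-offload path on the
     * flags reported by the arch code above. */
    static bool want_hw_crc32c(void)
    {
            return crc32_optimizations() & CRC32C_OPTIMIZATION;
    }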
diff --git a/lib/crc/powerpc/crc-t10dif.h b/lib/crc/powerpc/crc-t10dif.h
new file mode 100644
index 000000000000..8f4592a5323d
--- /dev/null
+++ b/lib/crc/powerpc/crc-t10dif.h
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Calculate a CRC T10-DIF with vpmsum acceleration
+ *
+ * Copyright 2017, Daniel Axtens, IBM Corporation.
+ * [based on crc32c-vpmsum_glue.c]
+ */
+
+#include <asm/simd.h>
+#include <asm/switch_to.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/preempt.h>
+#include <linux/uaccess.h>
+
+#define VMX_ALIGN 16
+#define VMX_ALIGN_MASK (VMX_ALIGN-1)
+
+#define VECTOR_BREAKPOINT 64
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto);
+
+u32 __crct10dif_vpmsum(u32 crc, unsigned char const *p, size_t len);
+
+static inline u16 crc_t10dif_arch(u16 crci, const u8 *p, size_t len)
+{
+ unsigned int prealign;
+ unsigned int tail;
+ u32 crc = crci;
+
+ if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) ||
+ !static_branch_likely(&have_vec_crypto) ||
+ unlikely(!may_use_simd()))
+ return crc_t10dif_generic(crc, p, len);
+
+ if ((unsigned long)p & VMX_ALIGN_MASK) {
+ prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
+ crc = crc_t10dif_generic(crc, p, prealign);
+ len -= prealign;
+ p += prealign;
+ }
+
+ if (len & ~VMX_ALIGN_MASK) {
+ crc <<= 16;
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_altivec();
+ crc = __crct10dif_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
+ disable_kernel_altivec();
+ pagefault_enable();
+ preempt_enable();
+ crc >>= 16;
+ }
+
+ tail = len & VMX_ALIGN_MASK;
+ if (tail) {
+ p += len & ~VMX_ALIGN_MASK;
+ crc = crc_t10dif_generic(crc, p, tail);
+ }
+
+ return crc & 0xffff;
+}
+
+#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch
+static void crc_t10dif_mod_init_arch(void)
+{
+ if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+ (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO))
+ static_branch_enable(&have_vec_crypto);
+}
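Two details of the glue above are worth spelling out. First, CRC-T10DIF is a 16-bit CRC while the vpmsum core produces a 32-bit-style remainder, so the state is shifted into the upper halfword around the call (crc <<= 16 before, crc >>= 16 after) and masked to 16 bits on return. Second, the prealign arithmetic restores 16-byte alignment before the vector code runs; checking it, with VMX_ALIGN = 16 and r = addr & 15 (r != 0):

    prealign = 16 - r
    (addr + prealign) & 15 = (addr + 16 - r) & 15 = 0

and the len < VECTOR_BREAKPOINT + VMX_ALIGN early-out guarantees len is at least 80 on the vector path, so subtracting prealign (at most 15) can never underflow.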
diff --git a/lib/crc/powerpc/crc-vpmsum-template.S b/lib/crc/powerpc/crc-vpmsum-template.S
new file mode 100644
index 000000000000..b0f87f595b26
--- /dev/null
+++ b/lib/crc/powerpc/crc-vpmsum-template.S
@@ -0,0 +1,746 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Core of the accelerated CRC algorithm.
+ * The including file must define the constants and CRC_FUNCTION_NAME,
+ * then include this file.
+ *
+ * Calculate the checksum of data that is 16-byte aligned and a multiple of
+ * 16 bytes.
+ *
+ * The first step is to reduce it to 1024 bits. We do this in 8 parallel
+ * chunks in order to mask the latency of the vpmsum instructions. If we
+ * have more than 32 kB of data to checksum we repeat this step multiple
+ * times, passing in the previous 1024 bits.
+ *
+ * The next step is to reduce the 1024 bits to 64 bits. This step adds
+ * 32 bits of 0s to the end - this matches what a CRC does. We just
+ * calculate constants that land the data in these 32 bits.
+ *
+ * We then use fixed point Barrett reduction to compute a mod n over GF(2)
+ * for n = CRC using POWER8 instructions. We use x = 32.
+ *
+ * https://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/ppc-opcode.h>
+
+#define MAX_SIZE 32768
+
+ .text
+
+#if defined(__BIG_ENDIAN__) && defined(REFLECT)
+#define BYTESWAP_DATA
+#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
+#define BYTESWAP_DATA
+#else
+#undef BYTESWAP_DATA
+#endif
+
+#define off16 r25
+#define off32 r26
+#define off48 r27
+#define off64 r28
+#define off80 r29
+#define off96 r30
+#define off112 r31
+
+#define const1 v24
+#define const2 v25
+
+#define byteswap v26
+#define mask_32bit v27
+#define mask_64bit v28
+#define zeroes v29
+
+#ifdef BYTESWAP_DATA
+#define VPERM(A, B, C, D) vperm A, B, C, D
+#else
+#define VPERM(A, B, C, D)
+#endif
+
+/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
+FUNC_START(CRC_FUNCTION_NAME)
+ std r31,-8(r1)
+ std r30,-16(r1)
+ std r29,-24(r1)
+ std r28,-32(r1)
+ std r27,-40(r1)
+ std r26,-48(r1)
+ std r25,-56(r1)
+
+ li off16,16
+ li off32,32
+ li off48,48
+ li off64,64
+ li off80,80
+ li off96,96
+ li off112,112
+ li r0,0
+
+	/* Enough room for saving 10 non-volatile VMX registers */
+ subi r6,r1,56+10*16
+ subi r7,r1,56+2*16
+
+ stvx v20,0,r6
+ stvx v21,off16,r6
+ stvx v22,off32,r6
+ stvx v23,off48,r6
+ stvx v24,off64,r6
+ stvx v25,off80,r6
+ stvx v26,off96,r6
+ stvx v27,off112,r6
+ stvx v28,0,r7
+ stvx v29,off16,r7
+
+ mr r10,r3
+
+ vxor zeroes,zeroes,zeroes
+ vspltisw v0,-1
+
+ vsldoi mask_32bit,zeroes,v0,4
+ vsldoi mask_64bit,zeroes,v0,8
+
+ /* Get the initial value into v8 */
+ vxor v8,v8,v8
+ MTVRD(v8, R3)
+#ifdef REFLECT
+ vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */
+#else
+ vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */
+#endif
+
+#ifdef BYTESWAP_DATA
+ LOAD_REG_ADDR(r3, .byteswap_constant)
+ lvx byteswap,0,r3
+ addi r3,r3,16
+#endif
+
+ cmpdi r5,256
+ blt .Lshort
+
+ rldicr r6,r5,0,56
+
+ /* Checksum in blocks of MAX_SIZE */
+1: lis r7,MAX_SIZE@h
+ ori r7,r7,MAX_SIZE@l
+ mr r9,r7
+ cmpd r6,r7
+ bgt 2f
+ mr r7,r6
+2: subf r6,r7,r6
+
+ /* our main loop does 128 bytes at a time */
+ srdi r7,r7,7
+
+ /*
+ * Work out the offset into the constants table to start at. Each
+ * constant is 16 bytes, and it is used against 128 bytes of input
+ * data - 128 / 16 = 8
+ */
+ sldi r8,r7,4
+ srdi r9,r9,3
+ subf r8,r8,r9
+
+ /* We reduce our final 128 bytes in a separate step */
+ addi r7,r7,-1
+ mtctr r7
+
+ LOAD_REG_ADDR(r3, .constants)
+
+ /* Find the start of our constants */
+ add r3,r3,r8
+
+ /* zero v0-v7 which will contain our checksums */
+ vxor v0,v0,v0
+ vxor v1,v1,v1
+ vxor v2,v2,v2
+ vxor v3,v3,v3
+ vxor v4,v4,v4
+ vxor v5,v5,v5
+ vxor v6,v6,v6
+ vxor v7,v7,v7
+
+ lvx const1,0,r3
+
+ /*
+ * If we are looping back to consume more data we use the values
+ * already in v16-v23.
+ */
+ cmpdi r0,1
+ beq 2f
+
+ /* First warm up pass */
+ lvx v16,0,r4
+ lvx v17,off16,r4
+ VPERM(v16,v16,v16,byteswap)
+ VPERM(v17,v17,v17,byteswap)
+ lvx v18,off32,r4
+ lvx v19,off48,r4
+ VPERM(v18,v18,v18,byteswap)
+ VPERM(v19,v19,v19,byteswap)
+ lvx v20,off64,r4
+ lvx v21,off80,r4
+ VPERM(v20,v20,v20,byteswap)
+ VPERM(v21,v21,v21,byteswap)
+ lvx v22,off96,r4
+ lvx v23,off112,r4
+ VPERM(v22,v22,v22,byteswap)
+ VPERM(v23,v23,v23,byteswap)
+ addi r4,r4,8*16
+
+ /* xor in initial value */
+ vxor v16,v16,v8
+
+2: bdz .Lfirst_warm_up_done
+
+ addi r3,r3,16
+ lvx const2,0,r3
+
+ /* Second warm up pass */
+ VPMSUMD(v8,v16,const1)
+ lvx v16,0,r4
+ VPERM(v16,v16,v16,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v9,v17,const1)
+ lvx v17,off16,r4
+ VPERM(v17,v17,v17,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v10,v18,const1)
+ lvx v18,off32,r4
+ VPERM(v18,v18,v18,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v11,v19,const1)
+ lvx v19,off48,r4
+ VPERM(v19,v19,v19,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v12,v20,const1)
+ lvx v20,off64,r4
+ VPERM(v20,v20,v20,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v13,v21,const1)
+ lvx v21,off80,r4
+ VPERM(v21,v21,v21,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v14,v22,const1)
+ lvx v22,off96,r4
+ VPERM(v22,v22,v22,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v15,v23,const1)
+ lvx v23,off112,r4
+ VPERM(v23,v23,v23,byteswap)
+
+ addi r4,r4,8*16
+
+ bdz .Lfirst_cool_down
+
+ /*
+ * main loop. We modulo schedule it such that it takes three iterations
+ * to complete - first iteration load, second iteration vpmsum, third
+ * iteration xor.
+ */
+ .balign 16
+4: lvx const1,0,r3
+ addi r3,r3,16
+ ori r2,r2,0
+
+ vxor v0,v0,v8
+ VPMSUMD(v8,v16,const2)
+ lvx v16,0,r4
+ VPERM(v16,v16,v16,byteswap)
+ ori r2,r2,0
+
+ vxor v1,v1,v9
+ VPMSUMD(v9,v17,const2)
+ lvx v17,off16,r4
+ VPERM(v17,v17,v17,byteswap)
+ ori r2,r2,0
+
+ vxor v2,v2,v10
+ VPMSUMD(v10,v18,const2)
+ lvx v18,off32,r4
+ VPERM(v18,v18,v18,byteswap)
+ ori r2,r2,0
+
+ vxor v3,v3,v11
+ VPMSUMD(v11,v19,const2)
+ lvx v19,off48,r4
+ VPERM(v19,v19,v19,byteswap)
+ lvx const2,0,r3
+ ori r2,r2,0
+
+ vxor v4,v4,v12
+ VPMSUMD(v12,v20,const1)
+ lvx v20,off64,r4
+ VPERM(v20,v20,v20,byteswap)
+ ori r2,r2,0
+
+ vxor v5,v5,v13
+ VPMSUMD(v13,v21,const1)
+ lvx v21,off80,r4
+ VPERM(v21,v21,v21,byteswap)
+ ori r2,r2,0
+
+ vxor v6,v6,v14
+ VPMSUMD(v14,v22,const1)
+ lvx v22,off96,r4
+ VPERM(v22,v22,v22,byteswap)
+ ori r2,r2,0
+
+ vxor v7,v7,v15
+ VPMSUMD(v15,v23,const1)
+ lvx v23,off112,r4
+ VPERM(v23,v23,v23,byteswap)
+
+ addi r4,r4,8*16
+
+ bdnz 4b
+
+.Lfirst_cool_down:
+ /* First cool down pass */
+ lvx const1,0,r3
+ addi r3,r3,16
+
+ vxor v0,v0,v8
+ VPMSUMD(v8,v16,const1)
+ ori r2,r2,0
+
+ vxor v1,v1,v9
+ VPMSUMD(v9,v17,const1)
+ ori r2,r2,0
+
+ vxor v2,v2,v10
+ VPMSUMD(v10,v18,const1)
+ ori r2,r2,0
+
+ vxor v3,v3,v11
+ VPMSUMD(v11,v19,const1)
+ ori r2,r2,0
+
+ vxor v4,v4,v12
+ VPMSUMD(v12,v20,const1)
+ ori r2,r2,0
+
+ vxor v5,v5,v13
+ VPMSUMD(v13,v21,const1)
+ ori r2,r2,0
+
+ vxor v6,v6,v14
+ VPMSUMD(v14,v22,const1)
+ ori r2,r2,0
+
+ vxor v7,v7,v15
+ VPMSUMD(v15,v23,const1)
+ ori r2,r2,0
+
+.Lsecond_cool_down:
+ /* Second cool down pass */
+ vxor v0,v0,v8
+ vxor v1,v1,v9
+ vxor v2,v2,v10
+ vxor v3,v3,v11
+ vxor v4,v4,v12
+ vxor v5,v5,v13
+ vxor v6,v6,v14
+ vxor v7,v7,v15
+
+#ifdef REFLECT
+ /*
+ * vpmsumd produces a 96 bit result in the least significant bits
+ * of the register. Since we are bit reflected we have to shift it
+ * left 32 bits so it occupies the least significant bits in the
+ * bit reflected domain.
+ */
+ vsldoi v0,v0,zeroes,4
+ vsldoi v1,v1,zeroes,4
+ vsldoi v2,v2,zeroes,4
+ vsldoi v3,v3,zeroes,4
+ vsldoi v4,v4,zeroes,4
+ vsldoi v5,v5,zeroes,4
+ vsldoi v6,v6,zeroes,4
+ vsldoi v7,v7,zeroes,4
+#endif
+
+ /* xor with last 1024 bits */
+ lvx v8,0,r4
+ lvx v9,off16,r4
+ VPERM(v8,v8,v8,byteswap)
+ VPERM(v9,v9,v9,byteswap)
+ lvx v10,off32,r4
+ lvx v11,off48,r4
+ VPERM(v10,v10,v10,byteswap)
+ VPERM(v11,v11,v11,byteswap)
+ lvx v12,off64,r4
+ lvx v13,off80,r4
+ VPERM(v12,v12,v12,byteswap)
+ VPERM(v13,v13,v13,byteswap)
+ lvx v14,off96,r4
+ lvx v15,off112,r4
+ VPERM(v14,v14,v14,byteswap)
+ VPERM(v15,v15,v15,byteswap)
+
+ addi r4,r4,8*16
+
+ vxor v16,v0,v8
+ vxor v17,v1,v9
+ vxor v18,v2,v10
+ vxor v19,v3,v11
+ vxor v20,v4,v12
+ vxor v21,v5,v13
+ vxor v22,v6,v14
+ vxor v23,v7,v15
+
+ li r0,1
+ cmpdi r6,0
+ addi r6,r6,128
+ bne 1b
+
+ /* Work out how many bytes we have left */
+ andi. r5,r5,127
+
+ /* Calculate where in the constant table we need to start */
+ subfic r6,r5,128
+ add r3,r3,r6
+
+ /* How many 16 byte chunks are in the tail */
+ srdi r7,r5,4
+ mtctr r7
+
+ /*
+ * Reduce the previously calculated 1024 bits to 64 bits, shifting
+ * 32 bits to include the trailing 32 bits of zeros
+ */
+ lvx v0,0,r3
+ lvx v1,off16,r3
+ lvx v2,off32,r3
+ lvx v3,off48,r3
+ lvx v4,off64,r3
+ lvx v5,off80,r3
+ lvx v6,off96,r3
+ lvx v7,off112,r3
+ addi r3,r3,8*16
+
+ VPMSUMW(v0,v16,v0)
+ VPMSUMW(v1,v17,v1)
+ VPMSUMW(v2,v18,v2)
+ VPMSUMW(v3,v19,v3)
+ VPMSUMW(v4,v20,v4)
+ VPMSUMW(v5,v21,v5)
+ VPMSUMW(v6,v22,v6)
+ VPMSUMW(v7,v23,v7)
+
+ /* Now reduce the tail (0 - 112 bytes) */
+ cmpdi r7,0
+ beq 1f
+
+ lvx v16,0,r4
+ lvx v17,0,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off16,r4
+ lvx v17,off16,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off32,r4
+ lvx v17,off32,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off48,r4
+ lvx v17,off48,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off64,r4
+ lvx v17,off64,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off80,r4
+ lvx v17,off80,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off96,r4
+ lvx v17,off96,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+
+ /* Now xor all the parallel chunks together */
+1: vxor v0,v0,v1
+ vxor v2,v2,v3
+ vxor v4,v4,v5
+ vxor v6,v6,v7
+
+ vxor v0,v0,v2
+ vxor v4,v4,v6
+
+ vxor v0,v0,v4
+
+.Lbarrett_reduction:
+ /* Barrett constants */
+ LOAD_REG_ADDR(r3, .barrett_constants)
+
+ lvx const1,0,r3
+ lvx const2,off16,r3
+
+ vsldoi v1,v0,v0,8
+ vxor v0,v0,v1 /* xor two 64 bit results together */
+
+#ifdef REFLECT
+ /* shift left one bit */
+ vspltisb v1,1
+ vsl v0,v0,v1
+#endif
+
+ vand v0,v0,mask_64bit
+#ifndef REFLECT
+ /*
+ * Now for the Barrett reduction algorithm. The idea is to calculate q,
+ * the multiple of our polynomial that we need to subtract. By
+ * doing the computation 2x bits higher (ie 64 bits) and shifting the
+ * result back down 2x bits, we round down to the nearest multiple.
+ */
+ VPMSUMD(v1,v0,const1) /* ma */
+ vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */
+ VPMSUMD(v1,v1,const2) /* qn */
+ vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
+
+ /*
+ * Get the result into r3. We need to shift it left 8 bytes:
+ * V0 [ 0 1 2 X ]
+ * V0 [ 0 X 2 3 ]
+ */
+ vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */
+#else
+ /*
+ * The reflected version of Barrett reduction. Instead of bit
+ * reflecting our data (which is expensive to do), we bit reflect our
+ * constants and our algorithm, which means the intermediate data in
+ * our vector registers goes from 0-63 instead of 63-0. We can reflect
+ * the algorithm because we don't carry in mod 2 arithmetic.
+ */
+ vand v1,v0,mask_32bit /* bottom 32 bits of a */
+ VPMSUMD(v1,v1,const1) /* ma */
+ vand v1,v1,mask_32bit /* bottom 32bits of ma */
+ VPMSUMD(v1,v1,const2) /* qn */
+ vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
+
+ /*
+ * Since we are bit reflected, the result (ie the low 32 bits) is in
+ * the high 32 bits. We just need to shift it left 4 bytes
+ * V0 [ 0 1 X 3 ]
+ * V0 [ 0 X 2 3 ]
+ */
+	vsldoi	v0,v0,zeroes,4	/* shift result into top 64 bits */
+#endif
+
+ /* Get it into r3 */
+ MFVRD(R3, v0)
+
+.Lout:
+ subi r6,r1,56+10*16
+ subi r7,r1,56+2*16
+
+ lvx v20,0,r6
+ lvx v21,off16,r6
+ lvx v22,off32,r6
+ lvx v23,off48,r6
+ lvx v24,off64,r6
+ lvx v25,off80,r6
+ lvx v26,off96,r6
+ lvx v27,off112,r6
+ lvx v28,0,r7
+ lvx v29,off16,r7
+
+ ld r31,-8(r1)
+ ld r30,-16(r1)
+ ld r29,-24(r1)
+ ld r28,-32(r1)
+ ld r27,-40(r1)
+ ld r26,-48(r1)
+ ld r25,-56(r1)
+
+ blr
+
+.Lfirst_warm_up_done:
+ lvx const1,0,r3
+ addi r3,r3,16
+
+ VPMSUMD(v8,v16,const1)
+ VPMSUMD(v9,v17,const1)
+ VPMSUMD(v10,v18,const1)
+ VPMSUMD(v11,v19,const1)
+ VPMSUMD(v12,v20,const1)
+ VPMSUMD(v13,v21,const1)
+ VPMSUMD(v14,v22,const1)
+ VPMSUMD(v15,v23,const1)
+
+ b .Lsecond_cool_down
+
+.Lshort:
+ cmpdi r5,0
+ beq .Lzero
+
+ LOAD_REG_ADDR(r3, .short_constants)
+
+ /* Calculate where in the constant table we need to start */
+ subfic r6,r5,256
+ add r3,r3,r6
+
+ /* How many 16 byte chunks? */
+ srdi r7,r5,4
+ mtctr r7
+
+ vxor v19,v19,v19
+ vxor v20,v20,v20
+
+ lvx v0,0,r4
+ lvx v16,0,r3
+ VPERM(v0,v0,v16,byteswap)
+ vxor v0,v0,v8 /* xor in initial value */
+ VPMSUMW(v0,v0,v16)
+ bdz .Lv0
+
+ lvx v1,off16,r4
+ lvx v17,off16,r3
+ VPERM(v1,v1,v17,byteswap)
+ VPMSUMW(v1,v1,v17)
+ bdz .Lv1
+
+ lvx v2,off32,r4
+ lvx v16,off32,r3
+ VPERM(v2,v2,v16,byteswap)
+ VPMSUMW(v2,v2,v16)
+ bdz .Lv2
+
+ lvx v3,off48,r4
+ lvx v17,off48,r3
+ VPERM(v3,v3,v17,byteswap)
+ VPMSUMW(v3,v3,v17)
+ bdz .Lv3
+
+ lvx v4,off64,r4
+ lvx v16,off64,r3
+ VPERM(v4,v4,v16,byteswap)
+ VPMSUMW(v4,v4,v16)
+ bdz .Lv4
+
+ lvx v5,off80,r4
+ lvx v17,off80,r3
+ VPERM(v5,v5,v17,byteswap)
+ VPMSUMW(v5,v5,v17)
+ bdz .Lv5
+
+ lvx v6,off96,r4
+ lvx v16,off96,r3
+ VPERM(v6,v6,v16,byteswap)
+ VPMSUMW(v6,v6,v16)
+ bdz .Lv6
+
+ lvx v7,off112,r4
+ lvx v17,off112,r3
+ VPERM(v7,v7,v17,byteswap)
+ VPMSUMW(v7,v7,v17)
+ bdz .Lv7
+
+ addi r3,r3,128
+ addi r4,r4,128
+
+ lvx v8,0,r4
+ lvx v16,0,r3
+ VPERM(v8,v8,v16,byteswap)
+ VPMSUMW(v8,v8,v16)
+ bdz .Lv8
+
+ lvx v9,off16,r4
+ lvx v17,off16,r3
+ VPERM(v9,v9,v17,byteswap)
+ VPMSUMW(v9,v9,v17)
+ bdz .Lv9
+
+ lvx v10,off32,r4
+ lvx v16,off32,r3
+ VPERM(v10,v10,v16,byteswap)
+ VPMSUMW(v10,v10,v16)
+ bdz .Lv10
+
+ lvx v11,off48,r4
+ lvx v17,off48,r3
+ VPERM(v11,v11,v17,byteswap)
+ VPMSUMW(v11,v11,v17)
+ bdz .Lv11
+
+ lvx v12,off64,r4
+ lvx v16,off64,r3
+ VPERM(v12,v12,v16,byteswap)
+ VPMSUMW(v12,v12,v16)
+ bdz .Lv12
+
+ lvx v13,off80,r4
+ lvx v17,off80,r3
+ VPERM(v13,v13,v17,byteswap)
+ VPMSUMW(v13,v13,v17)
+ bdz .Lv13
+
+ lvx v14,off96,r4
+ lvx v16,off96,r3
+ VPERM(v14,v14,v16,byteswap)
+ VPMSUMW(v14,v14,v16)
+ bdz .Lv14
+
+ lvx v15,off112,r4
+ lvx v17,off112,r3
+ VPERM(v15,v15,v17,byteswap)
+ VPMSUMW(v15,v15,v17)
+
+.Lv15: vxor v19,v19,v15
+.Lv14: vxor v20,v20,v14
+.Lv13: vxor v19,v19,v13
+.Lv12: vxor v20,v20,v12
+.Lv11: vxor v19,v19,v11
+.Lv10: vxor v20,v20,v10
+.Lv9: vxor v19,v19,v9
+.Lv8: vxor v20,v20,v8
+.Lv7: vxor v19,v19,v7
+.Lv6: vxor v20,v20,v6
+.Lv5: vxor v19,v19,v5
+.Lv4: vxor v20,v20,v4
+.Lv3: vxor v19,v19,v3
+.Lv2: vxor v20,v20,v2
+.Lv1: vxor v19,v19,v1
+.Lv0: vxor v20,v20,v0
+
+ vxor v0,v19,v20
+
+ b .Lbarrett_reduction
+
+.Lzero:
+ mr r3,r10
+ b .Lout
+
+FUNC_END(CRC_FUNCTION_NAME)
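Restating the Barrett step at the end of the template as formulas (the non-reflected branch, with the 32 trailing zero bits already appended, so a is a 64-bit value and p(x) has degree 32):

    m   = floor(x^64 / p(x))       (precomputed in .barrett_constants)
    q   = floor(a*m / x^64)        (VPMSUMD with m, then vsldoi)
    crc = a XOR q*p(x)             (VPMSUMD with p, then vxor)

All multiplications are carry-less (GF(2)), and in this field the quotient estimate is exact, which is why no correction step follows: q*p(x) agrees with a on every term of degree 32 and above, so the XOR leaves exactly a mod p(x). The REFLECT branch computes the same remainder with bit-reflected constants, so the data itself never needs to be bit-reversed.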
diff --git a/lib/crc/powerpc/crc32.h b/lib/crc/powerpc/crc32.h
new file mode 100644
index 000000000000..0c852272a382
--- /dev/null
+++ b/lib/crc/powerpc/crc32.h
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <asm/simd.h>
+#include <asm/switch_to.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/preempt.h>
+#include <linux/uaccess.h>
+
+#define VMX_ALIGN 16
+#define VMX_ALIGN_MASK (VMX_ALIGN-1)
+
+#define VECTOR_BREAKPOINT 512
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto);
+
+#define crc32_le_arch crc32_le_base /* not implemented on this arch */
+#define crc32_be_arch crc32_be_base /* not implemented on this arch */
+
+u32 __crc32c_vpmsum(u32 crc, const u8 *p, size_t len);
+
+static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
+{
+ unsigned int prealign;
+ unsigned int tail;
+
+ if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) ||
+ !static_branch_likely(&have_vec_crypto) ||
+ unlikely(!may_use_simd()))
+ return crc32c_base(crc, p, len);
+
+ if ((unsigned long)p & VMX_ALIGN_MASK) {
+ prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
+ crc = crc32c_base(crc, p, prealign);
+ len -= prealign;
+ p += prealign;
+ }
+
+ if (len & ~VMX_ALIGN_MASK) {
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_altivec();
+ crc = __crc32c_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
+ disable_kernel_altivec();
+ pagefault_enable();
+ preempt_enable();
+ }
+
+ tail = len & VMX_ALIGN_MASK;
+ if (tail) {
+ p += len & ~VMX_ALIGN_MASK;
+ crc = crc32c_base(crc, p, tail);
+ }
+
+ return crc;
+}
+
+#define crc32_mod_init_arch crc32_mod_init_arch
+static void crc32_mod_init_arch(void)
+{
+ if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+ (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO))
+ static_branch_enable(&have_vec_crypto);
+}
+
+static inline u32 crc32_optimizations_arch(void)
+{
+ if (static_key_enabled(&have_vec_crypto))
+ return CRC32C_OPTIMIZATION;
+ return 0;
+}
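Note the cutover point: with VECTOR_BREAKPOINT at 512 here (versus 64 in the T10 DIF header), buffers shorter than 512 + 16 = 528 bytes go straight to crc32c_base(), since the Altivec enable/disable and the register save/restore in the .S code only amortize over larger inputs. A worked split for one qualifying call, with illustrative numbers, assuming (unsigned long)p % 16 == 12 and len == 600:

    prealign = 16 - 12       = 4    -> crc32c_base()
    bulk     = (600-4) & ~15 = 592  -> __crc32c_vpmsum()
    tail     = (600-4) & 15  = 4    -> crc32c_base()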
diff --git a/lib/crc/powerpc/crc32c-vpmsum_asm.S b/lib/crc/powerpc/crc32c-vpmsum_asm.S
new file mode 100644
index 000000000000..1b35c55cce0a
--- /dev/null
+++ b/lib/crc/powerpc/crc32c-vpmsum_asm.S
@@ -0,0 +1,842 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Calculate a crc32c with vpmsum acceleration
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ */
+ .section .rodata
+.balign 16
+
+.byteswap_constant:
+ /* byte reverse permute constant */
+ .octa 0x0F0E0D0C0B0A09080706050403020100
+
+.constants:
+
+	/* Reduce 262144 bits to 1024 bits */
+ /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
+ .octa 0x00000000b6ca9e20000000009c37c408
+
+ /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
+ .octa 0x00000000350249a800000001b51df26c
+
+ /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
+ .octa 0x00000001862dac54000000000724b9d0
+
+ /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
+ .octa 0x00000001d87fb48c00000001c00532fe
+
+ /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
+ .octa 0x00000001f39b699e00000000f05a9362
+
+ /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
+ .octa 0x0000000101da11b400000001e1007970
+
+ /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
+ .octa 0x00000001cab571e000000000a57366ee
+
+ /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
+ .octa 0x00000000c7020cfe0000000192011284
+
+ /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
+ .octa 0x00000000cdaed1ae0000000162716d9a
+
+ /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
+ .octa 0x00000001e804effc00000000cd97ecde
+
+ /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
+ .octa 0x0000000077c3ea3a0000000058812bc0
+
+ /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
+ .octa 0x0000000068df31b40000000088b8c12e
+
+ /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
+ .octa 0x00000000b059b6c200000001230b234c
+
+ /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
+ .octa 0x0000000145fb8ed800000001120b416e
+
+ /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
+ .octa 0x00000000cbc0916800000001974aecb0
+
+ /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
+ .octa 0x000000005ceeedc2000000008ee3f226
+
+ /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
+ .octa 0x0000000047d74e8600000001089aba9a
+
+ /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
+ .octa 0x00000001407e9e220000000065113872
+
+ /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
+ .octa 0x00000001da967bda000000005c07ec10
+
+ /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
+ .octa 0x000000006c8983680000000187590924
+
+ /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
+ .octa 0x00000000f2d14c9800000000e35da7c6
+
+ /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
+ .octa 0x00000001993c6ad4000000000415855a
+
+ /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
+ .octa 0x000000014683d1ac0000000073617758
+
+ /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
+ .octa 0x00000001a7c93e6c0000000176021d28
+
+ /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
+ .octa 0x000000010211e90a00000001c358fd0a
+
+ /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
+ .octa 0x000000001119403e00000001ff7a2c18
+
+ /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
+ .octa 0x000000001c3261aa00000000f2d9f7e4
+
+ /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
+ .octa 0x000000014e37a634000000016cf1f9c8
+
+ /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
+ .octa 0x0000000073786c0c000000010af9279a
+
+ /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
+ .octa 0x000000011dc037f80000000004f101e8
+
+ /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
+ .octa 0x0000000031433dfc0000000070bcf184
+
+ /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
+ .octa 0x000000009cde8348000000000a8de642
+
+ /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
+ .octa 0x0000000038d3c2a60000000062ea130c
+
+ /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
+ .octa 0x000000011b25f26000000001eb31cbb2
+
+ /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
+ .octa 0x000000001629e6f00000000170783448
+
+ /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
+ .octa 0x0000000160838b4c00000001a684b4c6
+
+ /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
+ .octa 0x000000007a44011c00000000253ca5b4
+
+ /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
+ .octa 0x00000000226f417a0000000057b4b1e2
+
+ /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
+ .octa 0x0000000045eb2eb400000000b6bd084c
+
+ /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
+ .octa 0x000000014459d70c0000000123c2d592
+
+ /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
+ .octa 0x00000001d406ed8200000000159dafce
+
+ /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
+ .octa 0x0000000160c8e1a80000000127e1a64e
+
+ /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
+ .octa 0x0000000027ba80980000000056860754
+
+ /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
+ .octa 0x000000006d92d01800000001e661aae8
+
+ /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
+ .octa 0x000000012ed7e3f200000000f82c6166
+
+ /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
+ .octa 0x000000002dc8778800000000c4f9c7ae
+
+ /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
+ .octa 0x0000000018240bb80000000074203d20
+
+ /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
+ .octa 0x000000001ad381580000000198173052
+
+ /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
+ .octa 0x00000001396b78f200000001ce8aba54
+
+ /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
+ .octa 0x000000011a68133400000001850d5d94
+
+ /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
+ .octa 0x000000012104732e00000001d609239c
+
+ /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
+ .octa 0x00000000a140d90c000000001595f048
+
+ /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
+ .octa 0x00000001b7215eda0000000042ccee08
+
+ /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
+ .octa 0x00000001aaf1df3c000000010a389d74
+
+ /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
+ .octa 0x0000000029d15b8a000000012a840da6
+
+ /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
+ .octa 0x00000000f1a96922000000001d181c0c
+
+ /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
+ .octa 0x00000001ac80d03c0000000068b7d1f6
+
+ /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
+ .octa 0x000000000f11d56a000000005b0f14fc
+
+ /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
+ .octa 0x00000001f1c022a20000000179e9e730
+
+ /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
+ .octa 0x0000000173d00ae200000001ce1368d6
+
+ /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
+ .octa 0x00000001d4ffe4ac0000000112c3a84c
+
+ /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
+ .octa 0x000000016edc5ae400000000de940fee
+
+ /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
+ .octa 0x00000001f1a0214000000000fe896b7e
+
+ /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
+ .octa 0x00000000ca0b28a000000001f797431c
+
+ /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
+ .octa 0x00000001928e30a20000000053e989ba
+
+ /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
+ .octa 0x0000000097b1b002000000003920cd16
+
+ /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
+ .octa 0x00000000b15bf90600000001e6f579b8
+
+ /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
+ .octa 0x00000000411c5d52000000007493cb0a
+
+ /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
+ .octa 0x00000001c36f330000000001bdd376d8
+
+ /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
+ .octa 0x00000001119227e0000000016badfee6
+
+ /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
+ .octa 0x00000000114d47020000000071de5c58
+
+ /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
+ .octa 0x00000000458b5b9800000000453f317c
+
+ /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
+ .octa 0x000000012e31fb8e0000000121675cce
+
+ /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
+ .octa 0x000000005cf619d800000001f409ee92
+
+ /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
+ .octa 0x0000000063f4d8b200000000f36b9c88
+
+ /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
+ .octa 0x000000004138dc8a0000000036b398f4
+
+ /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
+ .octa 0x00000001d29ee8e000000001748f9adc
+
+ /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
+ .octa 0x000000006a08ace800000001be94ec00
+
+ /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
+ .octa 0x0000000127d4201000000000b74370d6
+
+ /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
+ .octa 0x0000000019d76b6200000001174d0b98
+
+ /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
+ .octa 0x00000001b1471f6e00000000befc06a4
+
+ /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
+ .octa 0x00000001f64c19cc00000001ae125288
+
+ /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
+ .octa 0x00000000003c0ea00000000095c19b34
+
+ /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
+ .octa 0x000000014d73abf600000001a78496f2
+
+ /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
+ .octa 0x00000001620eb84400000001ac5390a0
+
+ /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
+ .octa 0x0000000147655048000000002a80ed6e
+
+ /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
+ .octa 0x0000000067b5077e00000001fa9b0128
+
+ /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
+ .octa 0x0000000010ffe20600000001ea94929e
+
+ /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
+ .octa 0x000000000fee8f1e0000000125f4305c
+
+ /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
+ .octa 0x00000001da26fbae00000001471e2002
+
+ /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
+ .octa 0x00000001b3a8bd880000000132d2253a
+
+ /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
+ .octa 0x00000000e8f3898e00000000f26b3592
+
+ /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
+ .octa 0x00000000b0d0d28c00000000bc8b67b0
+
+ /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
+ .octa 0x0000000030f2a798000000013a826ef2
+
+ /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
+ .octa 0x000000000fba10020000000081482c84
+
+ /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
+ .octa 0x00000000bdb9bd7200000000e77307c2
+
+ /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
+ .octa 0x0000000075d3bf5a00000000d4a07ec8
+
+ /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
+ .octa 0x00000000ef1f98a00000000017102100
+
+ /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
+ .octa 0x00000000689c760200000000db406486
+
+ /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
+ .octa 0x000000016d5fa5fe0000000192db7f88
+
+ /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
+ .octa 0x00000001d0d2b9ca000000018bf67b1e
+
+ /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
+ .octa 0x0000000041e7b470000000007c09163e
+
+ /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
+ .octa 0x00000001cbb6495e000000000adac060
+
+ /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
+ .octa 0x000000010052a0b000000000bd8316ae
+
+ /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
+ .octa 0x00000001d8effb5c000000019f09ab54
+
+ /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
+ .octa 0x00000001d969853c0000000125155542
+
+ /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
+ .octa 0x00000000523ccce2000000018fdb5882
+
+ /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
+ .octa 0x000000001e2436bc00000000e794b3f4
+
+ /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
+ .octa 0x00000000ddd1c3a2000000016f9bb022
+
+ /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
+ .octa 0x0000000019fcfe3800000000290c9978
+
+ /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
+ .octa 0x00000001ce95db640000000083c0f350
+
+ /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
+ .octa 0x00000000af5828060000000173ea6628
+
+ /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
+ .octa 0x00000001006388f600000001c8b4e00a
+
+ /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
+ .octa 0x0000000179eca00a00000000de95d6aa
+
+ /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
+ .octa 0x0000000122410a6a000000010b7f7248
+
+ /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
+ .octa 0x000000004288e87c00000001326e3a06
+
+ /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
+ .octa 0x000000016c5490da00000000bb62c2e6
+
+ /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
+ .octa 0x00000000d1c71f6e0000000156a4b2c2
+
+ /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
+ .octa 0x00000001b4ce08a6000000011dfe763a
+
+ /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
+ .octa 0x00000001466ba60c000000007bcca8e2
+
+ /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
+ .octa 0x00000001f6c488a40000000186118faa
+
+ /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
+ .octa 0x000000013bfb06820000000111a65a88
+
+ /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
+ .octa 0x00000000690e9e54000000003565e1c4
+
+ /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
+ .octa 0x00000000281346b6000000012ed02a82
+
+ /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
+ .octa 0x000000015646402400000000c486ecfc
+
+ /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
+ .octa 0x000000016063a8dc0000000001b951b2
+
+ /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
+ .octa 0x0000000116a663620000000048143916
+
+ /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
+ .octa 0x000000017e8aa4d200000001dc2ae124
+
+ /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
+ .octa 0x00000001728eb10c00000001416c58d6
+
+ /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
+ .octa 0x00000001b08fd7fa00000000a479744a
+
+ /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
+ .octa 0x00000001092a16e80000000096ca3a26
+
+ /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
+ .octa 0x00000000a505637c00000000ff223d4e
+
+ /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
+ .octa 0x00000000d94869b2000000010e84da42
+
+ /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
+ .octa 0x00000001c8b203ae00000001b61ba3d0
+
+ /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
+ .octa 0x000000005704aea000000000680f2de8
+
+ /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
+ .octa 0x000000012e295fa2000000008772a9a8
+
+ /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
+ .octa 0x000000011d0908bc0000000155f295bc
+
+ /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
+ .octa 0x0000000193ed97ea00000000595f9282
+
+ /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
+ .octa 0x000000013a0f1c520000000164b1c25a
+
+ /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
+ .octa 0x000000010c2c40c000000000fbd67c50
+
+ /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
+ .octa 0x00000000ff6fac3e0000000096076268
+
+ /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
+ .octa 0x000000017b3609c000000001d288e4cc
+
+ /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
+ .octa 0x0000000088c8c92200000001eaac1bdc
+
+ /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
+ .octa 0x00000001751baae600000001f1ea39e2
+
+ /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
+ .octa 0x000000010795297200000001eb6506fc
+
+ /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
+ .octa 0x0000000162b00abe000000010f806ffe
+
+ /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
+ .octa 0x000000000d7b404c000000010408481e
+
+ /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
+ .octa 0x00000000763b13d40000000188260534
+
+ /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
+ .octa 0x00000000f6dc22d80000000058fc73e0
+
+ /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
+ .octa 0x000000007daae06000000000391c59b8
+
+ /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
+ .octa 0x000000013359ab7c000000018b638400
+
+ /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
+ .octa 0x000000008add438a000000011738f5c4
+
+ /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
+ .octa 0x00000001edbefdea000000008cf7c6da
+
+ /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
+ .octa 0x000000004104e0f800000001ef97fb16
+
+ /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
+ .octa 0x00000000b48a82220000000102130e20
+
+ /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
+ .octa 0x00000001bcb4684400000000db968898
+
+ /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
+ .octa 0x000000013293ce0a00000000b5047b5e
+
+ /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
+ .octa 0x00000001710d0844000000010b90fdb2
+
+ /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
+ .octa 0x0000000117907f6e000000004834a32e
+
+ /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
+ .octa 0x0000000087ddf93e0000000059c8f2b0
+
+ /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
+ .octa 0x000000005970e9b00000000122cec508
+
+ /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
+ .octa 0x0000000185b2b7d0000000000a330cda
+
+ /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
+ .octa 0x00000001dcee0efc000000014a47148c
+
+ /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
+ .octa 0x0000000030da27220000000042c61cb8
+
+ /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
+ .octa 0x000000012f925a180000000012fe6960
+
+ /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
+ .octa 0x00000000dd2e357c00000000dbda2c20
+
+ /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
+ .octa 0x00000000071c80de000000011122410c
+
+ /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
+ .octa 0x000000011513140a00000000977b2070
+
+ /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
+ .octa 0x00000001df876e8e000000014050438e
+
+ /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
+ .octa 0x000000015f81d6ce0000000147c840e8
+
+ /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
+ .octa 0x000000019dd94dbe00000001cc7c88ce
+
+ /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
+ .octa 0x00000001373d206e00000001476b35a4
+
+ /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
+ .octa 0x00000000668ccade000000013d52d508
+
+ /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
+ .octa 0x00000001b192d268000000008e4be32e
+
+ /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
+ .octa 0x00000000e30f3a7800000000024120fe
+
+ /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
+ .octa 0x000000010ef1f7bc00000000ddecddb4
+
+ /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
+ .octa 0x00000001f5ac738000000000d4d403bc
+
+ /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
+ .octa 0x000000011822ea7000000001734b89aa
+
+ /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
+ .octa 0x00000000c3a33848000000010e7a58d6
+
+ /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
+ .octa 0x00000001bd151c2400000001f9f04e9c
+
+ /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
+ .octa 0x0000000056002d7600000000b692225e
+
+ /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
+ .octa 0x000000014657c4f4000000019b8d3f3e
+
+ /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
+ .octa 0x0000000113742d7c00000001a874f11e
+
+ /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
+ .octa 0x000000019c5920ba000000010d5a4254
+
+ /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
+ .octa 0x000000005216d2d600000000bbb2f5d6
+
+ /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
+ .octa 0x0000000136f5ad8a0000000179cc0e36
+
+ /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
+ .octa 0x000000018b07beb600000001dca1da4a
+
+ /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
+ .octa 0x00000000db1e93b000000000feb1a192
+
+ /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
+ .octa 0x000000000b96fa3a00000000d1eeedd6
+
+ /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
+ .octa 0x00000001d9968af0000000008fad9bb4
+
+ /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
+ .octa 0x000000000e4a77a200000001884938e4
+
+ /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
+ .octa 0x00000000508c2ac800000001bc2e9bc0
+
+ /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
+ .octa 0x0000000021572a8000000001f9658a68
+
+ /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
+ .octa 0x00000001b859daf2000000001b9224fc
+
+ /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
+ .octa 0x000000016f7884740000000055b2fb84
+
+ /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
+ .octa 0x00000001b438810e000000018b090348
+
+ /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
+ .octa 0x0000000095ddc6f2000000011ccbd5ea
+
+ /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
+ .octa 0x00000001d977c20c0000000007ae47f8
+
+ /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
+ .octa 0x00000000ebedb99a0000000172acbec0
+
+ /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
+ .octa 0x00000001df9e9e9200000001c6e3ff20
+
+ /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
+ .octa 0x00000001a4a3f95200000000e1b38744
+
+ /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
+ .octa 0x00000000e2f5122000000000791585b2
+
+ /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
+ .octa 0x000000004aa01f3e00000000ac53b894
+
+ /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
+ .octa 0x00000000b3e90a5800000001ed5f2cf4
+
+ /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
+ .octa 0x000000000c9ca2aa00000001df48b2e0
+
+ /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
+ .octa 0x000000015168231600000000049c1c62
+
+ /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
+ .octa 0x0000000036fce78c000000017c460c12
+
+ /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
+ .octa 0x000000009037dc10000000015be4da7e
+
+ /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
+ .octa 0x00000000d3298582000000010f38f668
+
+ /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
+ .octa 0x00000001b42e8ad60000000039f40a00
+
+ /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
+ .octa 0x00000000142a983800000000bd4c10c4
+
+ /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
+ .octa 0x0000000109c7f1900000000042db1d98
+
+ /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
+ .octa 0x0000000056ff931000000001c905bae6
+
+ /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
+ .octa 0x00000001594513aa00000000069d40ea
+
+ /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
+ .octa 0x00000001e3b5b1e8000000008e4fbad0
+
+ /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
+ .octa 0x000000011dd5fc080000000047bedd46
+
+ /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
+ .octa 0x00000001675f0cc20000000026396bf8
+
+ /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
+ .octa 0x00000000d1c8dd4400000000379beb92
+
+ /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
+ .octa 0x0000000115ebd3d8000000000abae54a
+
+ /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
+ .octa 0x00000001ecbd0dac0000000007e6a128
+
+ /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
+ .octa 0x00000000cdf67af2000000000ade29d2
+
+ /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
+ .octa 0x000000004c01ff4c00000000f974c45c
+
+ /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
+ .octa 0x00000000f2d8657e00000000e77ac60a
+
+ /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
+ .octa 0x000000006bae74c40000000145895816
+
+ /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
+ .octa 0x0000000152af8aa00000000038e362be
+
+ /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
+ .octa 0x0000000004663802000000007f991a64
+
+ /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
+ .octa 0x00000001ab2f5afc00000000fa366d3a
+
+ /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
+ .octa 0x0000000074a4ebd400000001a2bb34f0
+
+ /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
+ .octa 0x00000001d7ab3a4c0000000028a9981e
+
+ /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
+ .octa 0x00000001a8da60c600000001dbc672be
+
+ /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
+ .octa 0x000000013cf6382000000000b04d77f6
+
+ /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
+ .octa 0x00000000bec12e1e0000000124400d96
+
+ /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
+ .octa 0x00000001c6368010000000014ca4b414
+
+ /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
+ .octa 0x00000001e6e78758000000012fe2c938
+
+ /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
+ .octa 0x000000008d7f2b3c00000001faed01e6
+
+ /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
+ .octa 0x000000016b4a156e000000007e80ecfe
+
+ /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
+ .octa 0x00000001c63cfeb60000000098daee94
+
+ /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
+ .octa 0x000000015f902670000000010a04edea
+
+ /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
+ .octa 0x00000001cd5de11e00000001c00b4524
+
+ /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
+ .octa 0x000000001acaec540000000170296550
+
+ /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
+ .octa 0x000000002bd0ca780000000181afaa48
+
+ /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
+ .octa 0x0000000032d63d5c0000000185a31ffa
+
+ /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
+ .octa 0x000000001c6d4e4c000000002469f608
+
+ /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
+ .octa 0x0000000106a60b92000000006980102a
+
+ /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
+ .octa 0x00000000d3855e120000000111ea9ca8
+
+ /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
+ .octa 0x00000000e312563600000001bd1d29ce
+
+ /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
+ .octa 0x000000009e8f7ea400000001b34b9580
+
+ /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
+ .octa 0x00000001c82e562c000000003076054e
+
+ /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
+ .octa 0x00000000ca9f09ce000000012a608ea4
+
+ /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
+ .octa 0x00000000c63764e600000000784d05fe
+
+ /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
+ .octa 0x0000000168d2e49e000000016ef0d82a
+
+ /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
+ .octa 0x00000000e986c1480000000075bda454
+
+ /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
+ .octa 0x00000000cfb65894000000003dc0a1c4
+
+ /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
+ .octa 0x0000000111cadee400000000e9a5d8be
+
+ /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
+ .octa 0x0000000171fb63ce00000001609bc4b4
+
+.short_constants:
+
+ /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
+ /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */
+ .octa 0x7fec2963e5bf80485cf015c388e56f72
+
+ /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */
+ .octa 0x38e888d4844752a9963a18920246e2e6
+
+ /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */
+ .octa 0x42316c00730206ad419a441956993a31
+
+ /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */
+ .octa 0x543d5c543e65ddf9924752ba2b830011
+
+ /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */
+ .octa 0x78e87aaf56767c9255bd7f9518e4a304
+
+ /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */
+ .octa 0x8f68fcec1903da7f6d76739fe0553f1e
+
+ /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */
+ .octa 0x3f4840246791d588c133722b1fe0b5c3
+
+ /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */
+ .octa 0x34c96751b04de25a64b67ee0e55ef1f3
+
+ /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */
+ .octa 0x156c8e180b4a395b069db049b8fdb1e7
+
+ /* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */
+ .octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e
+
+ /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */
+ .octa 0x041d37768cd75659817cdc5119b29a35
+
+ /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */
+ .octa 0x3a0777818cfaa9651ce9d94b36c41f1c
+
+ /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */
+ .octa 0x0e148e8252377a554f256efcb82be955
+
+ /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */
+ .octa 0x9c25531d19e65ddeec1631edb2dea967
+
+ /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */
+ .octa 0x790606ff9957c0a65d27e147510ac59a
+
+ /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */
+ .octa 0x82f63b786ea2d55ca66805eb18b8ea18
+
+
+.barrett_constants:
+	/* 33 bit reflected Barrett constant m = (4^32)/n */
+ .octa 0x000000000000000000000000dea713f1 /* x^64 div p(x)` */
+ /* 33 bit reflected Barrett constant n */
+ .octa 0x00000000000000000000000105ec76f1
+
+#define CRC_FUNCTION_NAME __crc32c_vpmsum
+#define REFLECT
+#include "crc-vpmsum-template.S"
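Every .octa in the table above packs values of the form x^k mod p(x), bit-reflected and shifted left one bit for the vpmsum formulation. The underlying values are straightforward to reproduce; a minimal user-space sketch for the CRC32C polynomial (the reflection and << 1 applied to the table entries are omitted here; the in-tree values came from a generator such as crc32-vpmsum):

    #include <stdint.h>
    #include <stdio.h>

    #define POLY 0x1EDC6F41u    /* low 32 bits of the 33-bit CRC32C p(x) */

    /* x^n mod p(x) over GF(2), one multiply-by-x step per iteration. */
    static uint32_t xpow_mod(uint64_t n)
    {
            uint32_t r = 1;                 /* x^0 */

            while (n--) {
                    uint32_t carry = r & 0x80000000u;

                    r <<= 1;                /* r *= x */
                    if (carry)
                            r ^= POLY;      /* reduce mod p(x) */
            }
            return r;
    }

    int main(void)
    {
            printf("x^32 mod p(x) = 0x%08x\n", xpow_mod(32));
            return 0;
    }

Sanity check: x^32 mod p(x) is just the low 32 bits of p(x) itself, 0x1edc6f41, and its bit reflection 0x82f63b78 is visible both in the final .short_constants entry and in the Barrett constant n above.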
diff --git a/lib/crc/powerpc/crct10dif-vpmsum_asm.S b/lib/crc/powerpc/crct10dif-vpmsum_asm.S
new file mode 100644
index 000000000000..47a6266d89a8
--- /dev/null
+++ b/lib/crc/powerpc/crct10dif-vpmsum_asm.S
@@ -0,0 +1,845 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Calculate a CRC T10DIF with vpmsum acceleration
+ *
+ * Constants generated by crc32-vpmsum, available at
+ * https://github.com/antonblanchard/crc32-vpmsum
+ *
+ * crc32-vpmsum is
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ */
+ .section .rodata
+.balign 16
+
+.byteswap_constant:
+ /* byte reverse permute constant */
+ .octa 0x0F0E0D0C0B0A09080706050403020100
+
+.constants:
+
+	/* Reduce 262144 bits to 1024 bits */
+ /* x^261184 mod p(x), x^261120 mod p(x) */
+ .octa 0x0000000056d300000000000052550000
+
+ /* x^260160 mod p(x), x^260096 mod p(x) */
+ .octa 0x00000000ee67000000000000a1e40000
+
+ /* x^259136 mod p(x), x^259072 mod p(x) */
+ .octa 0x0000000060830000000000004ad10000
+
+ /* x^258112 mod p(x), x^258048 mod p(x) */
+ .octa 0x000000008cfe0000000000009ab40000
+
+ /* x^257088 mod p(x), x^257024 mod p(x) */
+ .octa 0x000000003e93000000000000fdb50000
+
+ /* x^256064 mod p(x), x^256000 mod p(x) */
+ .octa 0x000000003c2000000000000045480000
+
+ /* x^255040 mod p(x), x^254976 mod p(x) */
+ .octa 0x00000000b1fc0000000000008d690000
+
+ /* x^254016 mod p(x), x^253952 mod p(x) */
+ .octa 0x00000000f82b00000000000024ad0000
+
+ /* x^252992 mod p(x), x^252928 mod p(x) */
+ .octa 0x0000000044420000000000009f1a0000
+
+ /* x^251968 mod p(x), x^251904 mod p(x) */
+ .octa 0x00000000e88c00000000000066ec0000
+
+ /* x^250944 mod p(x), x^250880 mod p(x) */
+ .octa 0x00000000385c000000000000c87d0000
+
+ /* x^249920 mod p(x), x^249856 mod p(x) */
+ .octa 0x000000003227000000000000c8ff0000
+
+ /* x^248896 mod p(x), x^248832 mod p(x) */
+ .octa 0x00000000a9a900000000000033440000
+
+ /* x^247872 mod p(x), x^247808 mod p(x) */
+ .octa 0x00000000abaa00000000000066eb0000
+
+ /* x^246848 mod p(x), x^246784 mod p(x) */
+ .octa 0x000000001ac3000000000000c4ef0000
+
+ /* x^245824 mod p(x), x^245760 mod p(x) */
+ .octa 0x0000000063f000000000000056f30000
+
+ /* x^244800 mod p(x), x^244736 mod p(x) */
+ .octa 0x0000000032cc00000000000002050000
+
+ /* x^243776 mod p(x), x^243712 mod p(x) */
+ .octa 0x00000000f8b5000000000000568e0000
+
+ /* x^242752 mod p(x), x^242688 mod p(x) */
+ .octa 0x000000008db100000000000064290000
+
+ /* x^241728 mod p(x), x^241664 mod p(x) */
+ .octa 0x0000000059ca0000000000006b660000
+
+ /* x^240704 mod p(x), x^240640 mod p(x) */
+ .octa 0x000000005f5c00000000000018f80000
+
+ /* x^239680 mod p(x), x^239616 mod p(x) */
+ .octa 0x0000000061af000000000000b6090000
+
+ /* x^238656 mod p(x), x^238592 mod p(x) */
+ .octa 0x00000000e29e000000000000099a0000
+
+ /* x^237632 mod p(x), x^237568 mod p(x) */
+ .octa 0x000000000975000000000000a8360000
+
+ /* x^236608 mod p(x), x^236544 mod p(x) */
+ .octa 0x0000000043900000000000004f570000
+
+ /* x^235584 mod p(x), x^235520 mod p(x) */
+ .octa 0x00000000f9cd000000000000134c0000
+
+ /* x^234560 mod p(x), x^234496 mod p(x) */
+ .octa 0x000000007c29000000000000ec380000
+
+ /* x^233536 mod p(x), x^233472 mod p(x) */
+ .octa 0x000000004c6a000000000000b0d10000
+
+ /* x^232512 mod p(x), x^232448 mod p(x) */
+ .octa 0x00000000e7290000000000007d3e0000
+
+ /* x^231488 mod p(x), x^231424 mod p(x) */
+ .octa 0x00000000f1ab000000000000f0b20000
+
+ /* x^230464 mod p(x), x^230400 mod p(x) */
+ .octa 0x0000000039db0000000000009c270000
+
+ /* x^229440 mod p(x), x^229376 mod p(x) */
+ .octa 0x000000005e2800000000000092890000
+
+ /* x^228416 mod p(x), x^228352 mod p(x) */
+ .octa 0x00000000d44e000000000000d5ee0000
+
+ /* x^227392 mod p(x), x^227328 mod p(x) */
+ .octa 0x00000000cd0a00000000000041f50000
+
+ /* x^226368 mod p(x), x^226304 mod p(x) */
+ .octa 0x00000000c5b400000000000010520000
+
+ /* x^225344 mod p(x), x^225280 mod p(x) */
+ .octa 0x00000000fd2100000000000042170000
+
+ /* x^224320 mod p(x), x^224256 mod p(x) */
+ .octa 0x000000002f2500000000000095c20000
+
+ /* x^223296 mod p(x), x^223232 mod p(x) */
+ .octa 0x000000001b0100000000000001ce0000
+
+ /* x^222272 mod p(x), x^222208 mod p(x) */
+ .octa 0x000000000d430000000000002aca0000
+
+ /* x^221248 mod p(x), x^221184 mod p(x) */
+ .octa 0x0000000030a6000000000000385e0000
+
+ /* x^220224 mod p(x), x^220160 mod p(x) */
+ .octa 0x00000000e37b0000000000006f7a0000
+
+ /* x^219200 mod p(x), x^219136 mod p(x) */
+ .octa 0x00000000873600000000000024320000
+
+ /* x^218176 mod p(x), x^218112 mod p(x) */
+ .octa 0x00000000e9fb000000000000bd9c0000
+
+ /* x^217152 mod p(x), x^217088 mod p(x) */
+ .octa 0x000000003b9500000000000054bc0000
+
+ /* x^216128 mod p(x), x^216064 mod p(x) */
+ .octa 0x00000000133e000000000000a4660000
+
+ /* x^215104 mod p(x), x^215040 mod p(x) */
+ .octa 0x00000000784500000000000079930000
+
+ /* x^214080 mod p(x), x^214016 mod p(x) */
+ .octa 0x00000000b9800000000000001bb80000
+
+ /* x^213056 mod p(x), x^212992 mod p(x) */
+ .octa 0x00000000687600000000000024400000
+
+ /* x^212032 mod p(x), x^211968 mod p(x) */
+ .octa 0x00000000aff300000000000029e10000
+
+ /* x^211008 mod p(x), x^210944 mod p(x) */
+ .octa 0x0000000024b50000000000005ded0000
+
+ /* x^209984 mod p(x), x^209920 mod p(x) */
+ .octa 0x0000000017e8000000000000b12e0000
+
+ /* x^208960 mod p(x), x^208896 mod p(x) */
+ .octa 0x00000000128400000000000026d20000
+
+ /* x^207936 mod p(x), x^207872 mod p(x) */
+ .octa 0x000000002115000000000000a32a0000
+
+ /* x^206912 mod p(x), x^206848 mod p(x) */
+ .octa 0x000000009595000000000000a1210000
+
+ /* x^205888 mod p(x), x^205824 mod p(x) */
+ .octa 0x00000000281e000000000000ee8b0000
+
+ /* x^204864 mod p(x), x^204800 mod p(x) */
+ .octa 0x0000000006010000000000003d0d0000
+
+ /* x^203840 mod p(x), x^203776 mod p(x) */
+ .octa 0x00000000e2b600000000000034e90000
+
+ /* x^202816 mod p(x), x^202752 mod p(x) */
+ .octa 0x000000001bd40000000000004cdb0000
+
+ /* x^201792 mod p(x), x^201728 mod p(x) */
+ .octa 0x00000000df2800000000000030e90000
+
+ /* x^200768 mod p(x), x^200704 mod p(x) */
+ .octa 0x0000000049c200000000000042590000
+
+ /* x^199744 mod p(x), x^199680 mod p(x) */
+ .octa 0x000000009b97000000000000df950000
+
+ /* x^198720 mod p(x), x^198656 mod p(x) */
+ .octa 0x000000006184000000000000da7b0000
+
+ /* x^197696 mod p(x), x^197632 mod p(x) */
+ .octa 0x00000000461700000000000012510000
+
+ /* x^196672 mod p(x), x^196608 mod p(x) */
+ .octa 0x000000009b40000000000000f37e0000
+
+ /* x^195648 mod p(x), x^195584 mod p(x) */
+ .octa 0x00000000eeb2000000000000ecf10000
+
+ /* x^194624 mod p(x), x^194560 mod p(x) */
+ .octa 0x00000000b2e800000000000050f20000
+
+ /* x^193600 mod p(x), x^193536 mod p(x) */
+ .octa 0x00000000f59a000000000000e0b30000
+
+ /* x^192576 mod p(x), x^192512 mod p(x) */
+ .octa 0x00000000467f0000000000004d5a0000
+
+ /* x^191552 mod p(x), x^191488 mod p(x) */
+ .octa 0x00000000da92000000000000bb010000
+
+ /* x^190528 mod p(x), x^190464 mod p(x) */
+ .octa 0x000000001e1000000000000022a40000
+
+ /* x^189504 mod p(x), x^189440 mod p(x) */
+ .octa 0x0000000058fe000000000000836f0000
+
+ /* x^188480 mod p(x), x^188416 mod p(x) */
+ .octa 0x00000000b9ce000000000000d78d0000
+
+ /* x^187456 mod p(x), x^187392 mod p(x) */
+ .octa 0x0000000022210000000000004f8d0000
+
+ /* x^186432 mod p(x), x^186368 mod p(x) */
+ .octa 0x00000000744600000000000033760000
+
+ /* x^185408 mod p(x), x^185344 mod p(x) */
+ .octa 0x000000001c2e000000000000a1e50000
+
+ /* x^184384 mod p(x), x^184320 mod p(x) */
+ .octa 0x00000000dcc8000000000000a1a40000
+
+ /* x^183360 mod p(x), x^183296 mod p(x) */
+ .octa 0x00000000910f00000000000019a20000
+
+ /* x^182336 mod p(x), x^182272 mod p(x) */
+ .octa 0x0000000055d5000000000000f6ae0000
+
+ /* x^181312 mod p(x), x^181248 mod p(x) */
+ .octa 0x00000000c8ba000000000000a7ac0000
+
+ /* x^180288 mod p(x), x^180224 mod p(x) */
+ .octa 0x0000000031f8000000000000eea20000
+
+ /* x^179264 mod p(x), x^179200 mod p(x) */
+ .octa 0x000000001966000000000000c4d90000
+
+ /* x^178240 mod p(x), x^178176 mod p(x) */
+ .octa 0x00000000b9810000000000002b470000
+
+ /* x^177216 mod p(x), x^177152 mod p(x) */
+ .octa 0x000000008303000000000000f7cf0000
+
+ /* x^176192 mod p(x), x^176128 mod p(x) */
+ .octa 0x000000002ce500000000000035b30000
+
+ /* x^175168 mod p(x), x^175104 mod p(x) */
+ .octa 0x000000002fae0000000000000c7c0000
+
+ /* x^174144 mod p(x), x^174080 mod p(x) */
+ .octa 0x00000000f50c0000000000009edf0000
+
+ /* x^173120 mod p(x), x^173056 mod p(x) */
+ .octa 0x00000000714f00000000000004cd0000
+
+ /* x^172096 mod p(x), x^172032 mod p(x) */
+ .octa 0x00000000c161000000000000541b0000
+
+ /* x^171072 mod p(x), x^171008 mod p(x) */
+ .octa 0x0000000021c8000000000000e2700000
+
+ /* x^170048 mod p(x), x^169984 mod p(x) */
+ .octa 0x00000000b93d00000000000009a60000
+
+ /* x^169024 mod p(x), x^168960 mod p(x) */
+ .octa 0x00000000fbcf000000000000761c0000
+
+ /* x^168000 mod p(x), x^167936 mod p(x) */
+ .octa 0x0000000026350000000000009db30000
+
+ /* x^166976 mod p(x), x^166912 mod p(x) */
+ .octa 0x00000000b64f0000000000003e9f0000
+
+ /* x^165952 mod p(x), x^165888 mod p(x) */
+ .octa 0x00000000bd0e00000000000078590000
+
+ /* x^164928 mod p(x), x^164864 mod p(x) */
+ .octa 0x00000000d9360000000000008bc80000
+
+ /* x^163904 mod p(x), x^163840 mod p(x) */
+ .octa 0x000000002f140000000000008c9f0000
+
+ /* x^162880 mod p(x), x^162816 mod p(x) */
+ .octa 0x000000006a270000000000006af70000
+
+ /* x^161856 mod p(x), x^161792 mod p(x) */
+ .octa 0x000000006685000000000000e5210000
+
+ /* x^160832 mod p(x), x^160768 mod p(x) */
+ .octa 0x0000000062da00000000000008290000
+
+ /* x^159808 mod p(x), x^159744 mod p(x) */
+ .octa 0x00000000bb4b000000000000e4d00000
+
+ /* x^158784 mod p(x), x^158720 mod p(x) */
+ .octa 0x00000000d2490000000000004ae10000
+
+ /* x^157760 mod p(x), x^157696 mod p(x) */
+ .octa 0x00000000c85b00000000000000e70000
+
+ /* x^156736 mod p(x), x^156672 mod p(x) */
+ .octa 0x00000000c37a00000000000015650000
+
+ /* x^155712 mod p(x), x^155648 mod p(x) */
+ .octa 0x0000000018530000000000001c2f0000
+
+ /* x^154688 mod p(x), x^154624 mod p(x) */
+ .octa 0x00000000b46600000000000037bd0000
+
+ /* x^153664 mod p(x), x^153600 mod p(x) */
+ .octa 0x00000000439b00000000000012190000
+
+ /* x^152640 mod p(x), x^152576 mod p(x) */
+ .octa 0x00000000b1260000000000005ece0000
+
+ /* x^151616 mod p(x), x^151552 mod p(x) */
+ .octa 0x00000000d8110000000000002a5e0000
+
+ /* x^150592 mod p(x), x^150528 mod p(x) */
+ .octa 0x00000000099f00000000000052330000
+
+ /* x^149568 mod p(x), x^149504 mod p(x) */
+ .octa 0x00000000f9f9000000000000f9120000
+
+ /* x^148544 mod p(x), x^148480 mod p(x) */
+ .octa 0x000000005cc00000000000000ddc0000
+
+ /* x^147520 mod p(x), x^147456 mod p(x) */
+ .octa 0x00000000343b00000000000012200000
+
+ /* x^146496 mod p(x), x^146432 mod p(x) */
+ .octa 0x000000009222000000000000d12b0000
+
+ /* x^145472 mod p(x), x^145408 mod p(x) */
+ .octa 0x00000000d781000000000000eb2d0000
+
+ /* x^144448 mod p(x), x^144384 mod p(x) */
+ .octa 0x000000000bf400000000000058970000
+
+ /* x^143424 mod p(x), x^143360 mod p(x) */
+ .octa 0x00000000094200000000000013690000
+
+ /* x^142400 mod p(x), x^142336 mod p(x) */
+ .octa 0x00000000d55100000000000051950000
+
+ /* x^141376 mod p(x), x^141312 mod p(x) */
+ .octa 0x000000008f11000000000000954b0000
+
+ /* x^140352 mod p(x), x^140288 mod p(x) */
+ .octa 0x00000000140f000000000000b29e0000
+
+ /* x^139328 mod p(x), x^139264 mod p(x) */
+ .octa 0x00000000c6db000000000000db5d0000
+
+ /* x^138304 mod p(x), x^138240 mod p(x) */
+ .octa 0x00000000715b000000000000dfaf0000
+
+ /* x^137280 mod p(x), x^137216 mod p(x) */
+ .octa 0x000000000dea000000000000e3b60000
+
+ /* x^136256 mod p(x), x^136192 mod p(x) */
+ .octa 0x000000006f94000000000000ddaf0000
+
+ /* x^135232 mod p(x), x^135168 mod p(x) */
+ .octa 0x0000000024e1000000000000e4f70000
+
+ /* x^134208 mod p(x), x^134144 mod p(x) */
+ .octa 0x000000008810000000000000aa110000
+
+ /* x^133184 mod p(x), x^133120 mod p(x) */
+ .octa 0x0000000030c2000000000000a8e60000
+
+ /* x^132160 mod p(x), x^132096 mod p(x) */
+ .octa 0x00000000e6d0000000000000ccf30000
+
+ /* x^131136 mod p(x), x^131072 mod p(x) */
+ .octa 0x000000004da000000000000079bf0000
+
+ /* x^130112 mod p(x), x^130048 mod p(x) */
+ .octa 0x000000007759000000000000b3a30000
+
+ /* x^129088 mod p(x), x^129024 mod p(x) */
+ .octa 0x00000000597400000000000028790000
+
+ /* x^128064 mod p(x), x^128000 mod p(x) */
+ .octa 0x000000007acd000000000000b5820000
+
+ /* x^127040 mod p(x), x^126976 mod p(x) */
+ .octa 0x00000000e6e400000000000026ad0000
+
+ /* x^126016 mod p(x), x^125952 mod p(x) */
+ .octa 0x000000006d49000000000000985b0000
+
+ /* x^124992 mod p(x), x^124928 mod p(x) */
+ .octa 0x000000000f0800000000000011520000
+
+ /* x^123968 mod p(x), x^123904 mod p(x) */
+ .octa 0x000000002c7f000000000000846c0000
+
+ /* x^122944 mod p(x), x^122880 mod p(x) */
+ .octa 0x000000005ce7000000000000ae1d0000
+
+ /* x^121920 mod p(x), x^121856 mod p(x) */
+ .octa 0x00000000d4cb000000000000e21d0000
+
+ /* x^120896 mod p(x), x^120832 mod p(x) */
+ .octa 0x000000003a2300000000000019bb0000
+
+ /* x^119872 mod p(x), x^119808 mod p(x) */
+ .octa 0x000000000e1700000000000095290000
+
+ /* x^118848 mod p(x), x^118784 mod p(x) */
+ .octa 0x000000006e6400000000000050d20000
+
+ /* x^117824 mod p(x), x^117760 mod p(x) */
+ .octa 0x000000008d5c0000000000000cd10000
+
+ /* x^116800 mod p(x), x^116736 mod p(x) */
+ .octa 0x00000000ef310000000000007b570000
+
+ /* x^115776 mod p(x), x^115712 mod p(x) */
+ .octa 0x00000000645d00000000000053d60000
+
+ /* x^114752 mod p(x), x^114688 mod p(x) */
+ .octa 0x0000000018fc00000000000077510000
+
+ /* x^113728 mod p(x), x^113664 mod p(x) */
+ .octa 0x000000000cb3000000000000a7b70000
+
+ /* x^112704 mod p(x), x^112640 mod p(x) */
+ .octa 0x00000000991b000000000000d0780000
+
+ /* x^111680 mod p(x), x^111616 mod p(x) */
+ .octa 0x00000000845a000000000000be3c0000
+
+ /* x^110656 mod p(x), x^110592 mod p(x) */
+ .octa 0x00000000d3a9000000000000df020000
+
+ /* x^109632 mod p(x), x^109568 mod p(x) */
+ .octa 0x0000000017d7000000000000063e0000
+
+ /* x^108608 mod p(x), x^108544 mod p(x) */
+ .octa 0x000000007a860000000000008ab40000
+
+ /* x^107584 mod p(x), x^107520 mod p(x) */
+ .octa 0x00000000fd7c000000000000c7bd0000
+
+ /* x^106560 mod p(x), x^106496 mod p(x) */
+ .octa 0x00000000a56b000000000000efd60000
+
+ /* x^105536 mod p(x), x^105472 mod p(x) */
+ .octa 0x0000000010e400000000000071380000
+
+ /* x^104512 mod p(x), x^104448 mod p(x) */
+ .octa 0x00000000994500000000000004d30000
+
+ /* x^103488 mod p(x), x^103424 mod p(x) */
+ .octa 0x00000000b83c0000000000003b0e0000
+
+ /* x^102464 mod p(x), x^102400 mod p(x) */
+ .octa 0x00000000d6c10000000000008b020000
+
+ /* x^101440 mod p(x), x^101376 mod p(x) */
+ .octa 0x000000009efc000000000000da940000
+
+ /* x^100416 mod p(x), x^100352 mod p(x) */
+ .octa 0x000000005e87000000000000f9f70000
+
+ /* x^99392 mod p(x), x^99328 mod p(x) */
+ .octa 0x000000006c9b00000000000045e40000
+
+ /* x^98368 mod p(x), x^98304 mod p(x) */
+ .octa 0x00000000178a00000000000083940000
+
+ /* x^97344 mod p(x), x^97280 mod p(x) */
+ .octa 0x00000000f0c8000000000000f0a00000
+
+ /* x^96320 mod p(x), x^96256 mod p(x) */
+ .octa 0x00000000f699000000000000b74b0000
+
+ /* x^95296 mod p(x), x^95232 mod p(x) */
+ .octa 0x00000000316d000000000000c1cf0000
+
+ /* x^94272 mod p(x), x^94208 mod p(x) */
+ .octa 0x00000000987e00000000000072680000
+
+ /* x^93248 mod p(x), x^93184 mod p(x) */
+ .octa 0x00000000acff000000000000e0ab0000
+
+ /* x^92224 mod p(x), x^92160 mod p(x) */
+ .octa 0x00000000a1f6000000000000c5a80000
+
+ /* x^91200 mod p(x), x^91136 mod p(x) */
+ .octa 0x0000000061bd000000000000cf690000
+
+ /* x^90176 mod p(x), x^90112 mod p(x) */
+ .octa 0x00000000c9f2000000000000cbcc0000
+
+ /* x^89152 mod p(x), x^89088 mod p(x) */
+ .octa 0x000000005a33000000000000de050000
+
+ /* x^88128 mod p(x), x^88064 mod p(x) */
+ .octa 0x00000000e416000000000000ccd70000
+
+ /* x^87104 mod p(x), x^87040 mod p(x) */
+ .octa 0x0000000058930000000000002f670000
+
+ /* x^86080 mod p(x), x^86016 mod p(x) */
+ .octa 0x00000000a9d3000000000000152f0000
+
+ /* x^85056 mod p(x), x^84992 mod p(x) */
+ .octa 0x00000000c114000000000000ecc20000
+
+ /* x^84032 mod p(x), x^83968 mod p(x) */
+ .octa 0x00000000b9270000000000007c890000
+
+ /* x^83008 mod p(x), x^82944 mod p(x) */
+ .octa 0x000000002e6000000000000006ee0000
+
+ /* x^81984 mod p(x), x^81920 mod p(x) */
+ .octa 0x00000000dfc600000000000009100000
+
+ /* x^80960 mod p(x), x^80896 mod p(x) */
+ .octa 0x000000004911000000000000ad4e0000
+
+ /* x^79936 mod p(x), x^79872 mod p(x) */
+ .octa 0x00000000ae1b000000000000b04d0000
+
+ /* x^78912 mod p(x), x^78848 mod p(x) */
+ .octa 0x0000000005fa000000000000e9900000
+
+ /* x^77888 mod p(x), x^77824 mod p(x) */
+ .octa 0x0000000004a1000000000000cc6f0000
+
+ /* x^76864 mod p(x), x^76800 mod p(x) */
+ .octa 0x00000000af73000000000000ed110000
+
+ /* x^75840 mod p(x), x^75776 mod p(x) */
+ .octa 0x0000000082530000000000008f7e0000
+
+ /* x^74816 mod p(x), x^74752 mod p(x) */
+ .octa 0x00000000cfdc000000000000594f0000
+
+ /* x^73792 mod p(x), x^73728 mod p(x) */
+ .octa 0x00000000a6b6000000000000a8750000
+
+ /* x^72768 mod p(x), x^72704 mod p(x) */
+ .octa 0x00000000fd76000000000000aa0c0000
+
+ /* x^71744 mod p(x), x^71680 mod p(x) */
+ .octa 0x0000000006f500000000000071db0000
+
+ /* x^70720 mod p(x), x^70656 mod p(x) */
+ .octa 0x0000000037ca000000000000ab0c0000
+
+ /* x^69696 mod p(x), x^69632 mod p(x) */
+ .octa 0x00000000d7ab000000000000b7a00000
+
+ /* x^68672 mod p(x), x^68608 mod p(x) */
+ .octa 0x00000000440800000000000090d30000
+
+ /* x^67648 mod p(x), x^67584 mod p(x) */
+ .octa 0x00000000186100000000000054730000
+
+ /* x^66624 mod p(x), x^66560 mod p(x) */
+ .octa 0x000000007368000000000000a3a20000
+
+ /* x^65600 mod p(x), x^65536 mod p(x) */
+ .octa 0x0000000026d0000000000000f9040000
+
+ /* x^64576 mod p(x), x^64512 mod p(x) */
+ .octa 0x00000000fe770000000000009c0a0000
+
+ /* x^63552 mod p(x), x^63488 mod p(x) */
+ .octa 0x000000002cba000000000000d1e70000
+
+ /* x^62528 mod p(x), x^62464 mod p(x) */
+ .octa 0x00000000f8bd0000000000005ac10000
+
+ /* x^61504 mod p(x), x^61440 mod p(x) */
+ .octa 0x000000007372000000000000d68d0000
+
+ /* x^60480 mod p(x), x^60416 mod p(x) */
+ .octa 0x00000000f37f00000000000089f60000
+
+ /* x^59456 mod p(x), x^59392 mod p(x) */
+ .octa 0x00000000078400000000000008a90000
+
+ /* x^58432 mod p(x), x^58368 mod p(x) */
+ .octa 0x00000000d3e400000000000042360000
+
+ /* x^57408 mod p(x), x^57344 mod p(x) */
+ .octa 0x00000000eba800000000000092d50000
+
+ /* x^56384 mod p(x), x^56320 mod p(x) */
+ .octa 0x00000000afbe000000000000b4d50000
+
+ /* x^55360 mod p(x), x^55296 mod p(x) */
+ .octa 0x00000000d8ca000000000000c9060000
+
+ /* x^54336 mod p(x), x^54272 mod p(x) */
+ .octa 0x00000000c2d00000000000008f4f0000
+
+ /* x^53312 mod p(x), x^53248 mod p(x) */
+ .octa 0x00000000373200000000000028690000
+
+ /* x^52288 mod p(x), x^52224 mod p(x) */
+ .octa 0x0000000046ae000000000000c3b30000
+
+ /* x^51264 mod p(x), x^51200 mod p(x) */
+ .octa 0x00000000b243000000000000f8700000
+
+ /* x^50240 mod p(x), x^50176 mod p(x) */
+ .octa 0x00000000f7f500000000000029eb0000
+
+ /* x^49216 mod p(x), x^49152 mod p(x) */
+ .octa 0x000000000c7e000000000000fe730000
+
+ /* x^48192 mod p(x), x^48128 mod p(x) */
+ .octa 0x00000000c38200000000000096000000
+
+ /* x^47168 mod p(x), x^47104 mod p(x) */
+ .octa 0x000000008956000000000000683c0000
+
+ /* x^46144 mod p(x), x^46080 mod p(x) */
+ .octa 0x00000000422d0000000000005f1e0000
+
+ /* x^45120 mod p(x), x^45056 mod p(x) */
+ .octa 0x00000000ac0f0000000000006f810000
+
+ /* x^44096 mod p(x), x^44032 mod p(x) */
+ .octa 0x00000000ce30000000000000031f0000
+
+ /* x^43072 mod p(x), x^43008 mod p(x) */
+ .octa 0x000000003d43000000000000455a0000
+
+ /* x^42048 mod p(x), x^41984 mod p(x) */
+ .octa 0x000000007ebe000000000000a6050000
+
+ /* x^41024 mod p(x), x^40960 mod p(x) */
+ .octa 0x00000000976e00000000000077eb0000
+
+ /* x^40000 mod p(x), x^39936 mod p(x) */
+ .octa 0x000000000872000000000000389c0000
+
+ /* x^38976 mod p(x), x^38912 mod p(x) */
+ .octa 0x000000008979000000000000c7b20000
+
+ /* x^37952 mod p(x), x^37888 mod p(x) */
+ .octa 0x000000005c1e0000000000001d870000
+
+ /* x^36928 mod p(x), x^36864 mod p(x) */
+ .octa 0x00000000aebb00000000000045810000
+
+ /* x^35904 mod p(x), x^35840 mod p(x) */
+ .octa 0x000000004f7e0000000000006d4a0000
+
+ /* x^34880 mod p(x), x^34816 mod p(x) */
+ .octa 0x00000000ea98000000000000b9200000
+
+ /* x^33856 mod p(x), x^33792 mod p(x) */
+ .octa 0x00000000f39600000000000022f20000
+
+ /* x^32832 mod p(x), x^32768 mod p(x) */
+ .octa 0x000000000bc500000000000041ca0000
+
+ /* x^31808 mod p(x), x^31744 mod p(x) */
+ .octa 0x00000000786400000000000078500000
+
+ /* x^30784 mod p(x), x^30720 mod p(x) */
+ .octa 0x00000000be970000000000009e7e0000
+
+ /* x^29760 mod p(x), x^29696 mod p(x) */
+ .octa 0x00000000dd6d000000000000a53c0000
+
+ /* x^28736 mod p(x), x^28672 mod p(x) */
+ .octa 0x000000004c3f00000000000039340000
+
+ /* x^27712 mod p(x), x^27648 mod p(x) */
+ .octa 0x0000000093a4000000000000b58e0000
+
+ /* x^26688 mod p(x), x^26624 mod p(x) */
+ .octa 0x0000000050fb00000000000062d40000
+
+ /* x^25664 mod p(x), x^25600 mod p(x) */
+ .octa 0x00000000f505000000000000a26f0000
+
+ /* x^24640 mod p(x), x^24576 mod p(x) */
+ .octa 0x0000000064f900000000000065e60000
+
+ /* x^23616 mod p(x), x^23552 mod p(x) */
+ .octa 0x00000000e8c2000000000000aad90000
+
+ /* x^22592 mod p(x), x^22528 mod p(x) */
+ .octa 0x00000000720b000000000000a3b00000
+
+ /* x^21568 mod p(x), x^21504 mod p(x) */
+ .octa 0x00000000e992000000000000d2680000
+
+ /* x^20544 mod p(x), x^20480 mod p(x) */
+ .octa 0x000000009132000000000000cf4c0000
+
+ /* x^19520 mod p(x), x^19456 mod p(x) */
+ .octa 0x00000000608a00000000000076610000
+
+ /* x^18496 mod p(x), x^18432 mod p(x) */
+ .octa 0x000000009948000000000000fb9f0000
+
+ /* x^17472 mod p(x), x^17408 mod p(x) */
+ .octa 0x00000000173000000000000003770000
+
+ /* x^16448 mod p(x), x^16384 mod p(x) */
+ .octa 0x000000006fe300000000000004880000
+
+ /* x^15424 mod p(x), x^15360 mod p(x) */
+ .octa 0x00000000e15300000000000056a70000
+
+ /* x^14400 mod p(x), x^14336 mod p(x) */
+ .octa 0x0000000092d60000000000009dfd0000
+
+ /* x^13376 mod p(x), x^13312 mod p(x) */
+ .octa 0x0000000002fd00000000000074c80000
+
+ /* x^12352 mod p(x), x^12288 mod p(x) */
+ .octa 0x00000000c78b000000000000a3ec0000
+
+ /* x^11328 mod p(x), x^11264 mod p(x) */
+ .octa 0x000000009262000000000000b3530000
+
+ /* x^10304 mod p(x), x^10240 mod p(x) */
+ .octa 0x0000000084f200000000000047bf0000
+
+ /* x^9280 mod p(x), x^9216 mod p(x) */
+ .octa 0x0000000067ee000000000000e97c0000
+
+ /* x^8256 mod p(x), x^8192 mod p(x) */
+ .octa 0x00000000535b00000000000091e10000
+
+ /* x^7232 mod p(x), x^7168 mod p(x) */
+ .octa 0x000000007ebb00000000000055060000
+
+ /* x^6208 mod p(x), x^6144 mod p(x) */
+ .octa 0x00000000c6a1000000000000fd360000
+
+ /* x^5184 mod p(x), x^5120 mod p(x) */
+ .octa 0x000000001be500000000000055860000
+
+ /* x^4160 mod p(x), x^4096 mod p(x) */
+ .octa 0x00000000ae0e0000000000005bd00000
+
+ /* x^3136 mod p(x), x^3072 mod p(x) */
+ .octa 0x0000000022040000000000008db20000
+
+ /* x^2112 mod p(x), x^2048 mod p(x) */
+ .octa 0x00000000c9eb000000000000efe20000
+
+ /* x^1088 mod p(x), x^1024 mod p(x) */
+ .octa 0x0000000039b400000000000051d10000
+
+.short_constants:
+
+ /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
+ /* x^2048 mod p(x), x^2016 mod p(x), x^1984 mod p(x), x^1952 mod p(x) */
+ .octa 0xefe20000dccf00009440000033590000
+
+ /* x^1920 mod p(x), x^1888 mod p(x), x^1856 mod p(x), x^1824 mod p(x) */
+ .octa 0xee6300002f3f000062180000e0ed0000
+
+ /* x^1792 mod p(x), x^1760 mod p(x), x^1728 mod p(x), x^1696 mod p(x) */
+ .octa 0xcf5f000017ef0000ccbe000023d30000
+
+ /* x^1664 mod p(x), x^1632 mod p(x), x^1600 mod p(x), x^1568 mod p(x) */
+ .octa 0x6d0c0000a30e00000920000042630000
+
+ /* x^1536 mod p(x), x^1504 mod p(x), x^1472 mod p(x), x^1440 mod p(x) */
+ .octa 0x21d30000932b0000a7a00000efcc0000
+
+ /* x^1408 mod p(x), x^1376 mod p(x), x^1344 mod p(x), x^1312 mod p(x) */
+ .octa 0x10be00000b310000666f00000d1c0000
+
+ /* x^1280 mod p(x), x^1248 mod p(x), x^1216 mod p(x), x^1184 mod p(x) */
+ .octa 0x1f240000ce9e0000caad0000589e0000
+
+ /* x^1152 mod p(x), x^1120 mod p(x), x^1088 mod p(x), x^1056 mod p(x) */
+ .octa 0x29610000d02b000039b400007cf50000
+
+ /* x^1024 mod p(x), x^992 mod p(x), x^960 mod p(x), x^928 mod p(x) */
+ .octa 0x51d100009d9d00003c0e0000bfd60000
+
+ /* x^896 mod p(x), x^864 mod p(x), x^832 mod p(x), x^800 mod p(x) */
+ .octa 0xda390000ceae000013830000713c0000
+
+ /* x^768 mod p(x), x^736 mod p(x), x^704 mod p(x), x^672 mod p(x) */
+ .octa 0xb67800001e16000085c0000080a60000
+
+ /* x^640 mod p(x), x^608 mod p(x), x^576 mod p(x), x^544 mod p(x) */
+ .octa 0x0db40000f7f90000371d0000e6580000
+
+ /* x^512 mod p(x), x^480 mod p(x), x^448 mod p(x), x^416 mod p(x) */
+ .octa 0x87e70000044c0000aadb0000a4970000
+
+ /* x^384 mod p(x), x^352 mod p(x), x^320 mod p(x), x^288 mod p(x) */
+ .octa 0x1f990000ad180000d8b30000e7b50000
+
+ /* x^256 mod p(x), x^224 mod p(x), x^192 mod p(x), x^160 mod p(x) */
+ .octa 0xbe6c00006ee300004c1a000006df0000
+
+ /* x^128 mod p(x), x^96 mod p(x), x^64 mod p(x), x^32 mod p(x) */
+ .octa 0xfb0b00002d560000136800008bb70000
+
+
+.barrett_constants:
+ /* Barrett constant m = floor(x^64 / n) */
+ .octa 0x000000000000000000000001f65a57f8 /* x^64 div p(x) */
+ /* Barrett constant n */
+ .octa 0x0000000000000000000000018bb70000
+
+#define CRC_FUNCTION_NAME __crct10dif_vpmsum
+#include "crc-vpmsum-template.S"
diff --git a/lib/crc/riscv/crc-clmul-consts.h b/lib/crc/riscv/crc-clmul-consts.h
new file mode 100644
index 000000000000..8d73449235ef
--- /dev/null
+++ b/lib/crc/riscv/crc-clmul-consts.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * CRC constants generated by:
+ *
+ * ./scripts/gen-crc-consts.py riscv_clmul crc16_msb_0x8bb7,crc32_msb_0x04c11db7,crc32_lsb_0xedb88320,crc32_lsb_0x82f63b78,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
+ *
+ * Do not edit manually.
+ */
+
+struct crc_clmul_consts {
+ unsigned long fold_across_2_longs_const_hi;
+ unsigned long fold_across_2_longs_const_lo;
+ unsigned long barrett_reduction_const_1;
+ unsigned long barrett_reduction_const_2;
+};
+
+/*
+ * Constants generated for most-significant-bit-first CRC-16 using
+ * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+ */
+static const struct crc_clmul_consts crc16_msb_0x8bb7_consts __maybe_unused = {
+#ifdef CONFIG_64BIT
+ .fold_across_2_longs_const_hi = 0x0000000000001faa, /* x^192 mod G */
+ .fold_across_2_longs_const_lo = 0x000000000000a010, /* x^128 mod G */
+ .barrett_reduction_const_1 = 0xfb2d2bfc0e99d245, /* floor(x^79 / G) */
+ .barrett_reduction_const_2 = 0x0000000000008bb7, /* G - x^16 */
+#else
+ .fold_across_2_longs_const_hi = 0x00005890, /* x^96 mod G */
+ .fold_across_2_longs_const_lo = 0x0000f249, /* x^64 mod G */
+ .barrett_reduction_const_1 = 0xfb2d2bfc, /* floor(x^47 / G) */
+ .barrett_reduction_const_2 = 0x00008bb7, /* G - x^16 */
+#endif
+};
+
+/*
+ * Constants generated for most-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
+ * x^5 + x^4 + x^2 + x^1 + x^0
+ */
+static const struct crc_clmul_consts crc32_msb_0x04c11db7_consts __maybe_unused = {
+#ifdef CONFIG_64BIT
+ .fold_across_2_longs_const_hi = 0x00000000c5b9cd4c, /* x^192 mod G */
+ .fold_across_2_longs_const_lo = 0x00000000e8a45605, /* x^128 mod G */
+ .barrett_reduction_const_1 = 0x826880efa40da72d, /* floor(x^95 / G) */
+ .barrett_reduction_const_2 = 0x0000000004c11db7, /* G - x^32 */
+#else
+ .fold_across_2_longs_const_hi = 0xf200aa66, /* x^96 mod G */
+ .fold_across_2_longs_const_lo = 0x490d678d, /* x^64 mod G */
+ .barrett_reduction_const_1 = 0x826880ef, /* floor(x^63 / G) */
+ .barrett_reduction_const_2 = 0x04c11db7, /* G - x^32 */
+#endif
+};
+
+/*
+ * Constants generated for least-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
+ * x^5 + x^4 + x^2 + x^1 + x^0
+ */
+static const struct crc_clmul_consts crc32_lsb_0xedb88320_consts __maybe_unused = {
+#ifdef CONFIG_64BIT
+ .fold_across_2_longs_const_hi = 0x65673b4600000000, /* x^191 mod G */
+ .fold_across_2_longs_const_lo = 0x9ba54c6f00000000, /* x^127 mod G */
+ .barrett_reduction_const_1 = 0xb4e5b025f7011641, /* floor(x^95 / G) */
+ .barrett_reduction_const_2 = 0x00000000edb88320, /* (G - x^32) * x^32 */
+#else
+ .fold_across_2_longs_const_hi = 0xccaa009e, /* x^95 mod G */
+ .fold_across_2_longs_const_lo = 0xb8bc6765, /* x^63 mod G */
+ .barrett_reduction_const_1 = 0xf7011641, /* floor(x^63 / G) */
+ .barrett_reduction_const_2 = 0xedb88320, /* (G - x^32) * x^0 */
+#endif
+};
+
+/*
+ * Constants generated for least-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^28 + x^27 + x^26 + x^25 + x^23 + x^22 + x^20 + x^19 + x^18 +
+ * x^14 + x^13 + x^11 + x^10 + x^9 + x^8 + x^6 + x^0
+ */
+static const struct crc_clmul_consts crc32_lsb_0x82f63b78_consts __maybe_unused = {
+#ifdef CONFIG_64BIT
+ .fold_across_2_longs_const_hi = 0x3743f7bd00000000, /* x^191 mod G */
+ .fold_across_2_longs_const_lo = 0x3171d43000000000, /* x^127 mod G */
+ .barrett_reduction_const_1 = 0x4869ec38dea713f1, /* floor(x^95 / G) */
+ .barrett_reduction_const_2 = 0x0000000082f63b78, /* (G - x^32) * x^32 */
+#else
+ .fold_across_2_longs_const_hi = 0x493c7d27, /* x^95 mod G */
+ .fold_across_2_longs_const_lo = 0xdd45aab8, /* x^63 mod G */
+ .barrett_reduction_const_1 = 0xdea713f1, /* floor(x^63 / G) */
+ .barrett_reduction_const_2 = 0x82f63b78, /* (G - x^32) * x^0 */
+#endif
+};
+
+/*
+ * Constants generated for most-significant-bit-first CRC-64 using
+ * G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
+ * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
+ * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
+ * x^7 + x^4 + x^1 + x^0
+ */
+#ifdef CONFIG_64BIT
+static const struct crc_clmul_consts crc64_msb_0x42f0e1eba9ea3693_consts __maybe_unused = {
+ .fold_across_2_longs_const_hi = 0x4eb938a7d257740e, /* x^192 mod G */
+ .fold_across_2_longs_const_lo = 0x05f5c3c7eb52fab6, /* x^128 mod G */
+ .barrett_reduction_const_1 = 0xabc694e836627c39, /* floor(x^127 / G) */
+ .barrett_reduction_const_2 = 0x42f0e1eba9ea3693, /* G - x^64 */
+};
+#endif
+
+/*
+ * Constants generated for least-significant-bit-first CRC-64 using
+ * G(x) = x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 +
+ * x^47 + x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 +
+ * x^26 + x^23 + x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 +
+ * x^4 + x^3 + x^0
+ */
+#ifdef CONFIG_64BIT
+static const struct crc_clmul_consts crc64_lsb_0x9a6c9329ac4bc9b5_consts __maybe_unused = {
+ .fold_across_2_longs_const_hi = 0xeadc41fd2ba3d420, /* x^191 mod G */
+ .fold_across_2_longs_const_lo = 0x21e9761e252621ac, /* x^127 mod G */
+ .barrett_reduction_const_1 = 0x27ecfa329aef9f77, /* floor(x^127 / G) */
+ .barrett_reduction_const_2 = 0x9a6c9329ac4bc9b5, /* (G - x^64) * x^0 */
+};
+#endif
diff --git a/lib/crc/riscv/crc-clmul-template.h b/lib/crc/riscv/crc-clmul-template.h
new file mode 100644
index 000000000000..77187e7f1762
--- /dev/null
+++ b/lib/crc/riscv/crc-clmul-template.h
@@ -0,0 +1,265 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright 2025 Google LLC */
+
+/*
+ * This file is a "template" that generates a CRC function optimized using the
+ * RISC-V Zbc (scalar carryless multiplication) extension. The includer of this
+ * file must define the following parameters to specify the type of CRC:
+ *
+ * crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC
+ * LSB_CRC: 0 for a msb (most-significant-bit) first CRC, i.e. natural
+ * mapping between bits and polynomial coefficients
+ * 1 for a lsb (least-significant-bit) first CRC, i.e. reflected
+ * mapping between bits and polynomial coefficients
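+ *
+ * For example, crc16_msb.c in this directory instantiates the template as:
+ *
+ *	typedef u16 crc_t;
+ *	#define LSB_CRC 0
+ *	#include "crc-clmul-template.h"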
+ */
+
+#include <asm/byteorder.h>
+#include <linux/minmax.h>
+
+#define CRC_BITS (8 * sizeof(crc_t)) /* a.k.a. 'n' */
+
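+/*
+ * Thin wrappers around the Zbc scalar carryless-multiply instructions:
+ * clmul returns the low XLEN bits of the carryless (GF(2)) product of a and
+ * b, clmulh the high XLEN bits, and clmulr the product shifted right by
+ * XLEN - 1 (i.e. bits XLEN-1 .. 2*XLEN-2).
+ */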
+static inline unsigned long clmul(unsigned long a, unsigned long b)
+{
+ unsigned long res;
+
+ asm(".option push\n"
+ ".option arch,+zbc\n"
+ "clmul %0, %1, %2\n"
+ ".option pop\n"
+ : "=r" (res) : "r" (a), "r" (b));
+ return res;
+}
+
+static inline unsigned long clmulh(unsigned long a, unsigned long b)
+{
+ unsigned long res;
+
+ asm(".option push\n"
+ ".option arch,+zbc\n"
+ "clmulh %0, %1, %2\n"
+ ".option pop\n"
+ : "=r" (res) : "r" (a), "r" (b));
+ return res;
+}
+
+static inline unsigned long clmulr(unsigned long a, unsigned long b)
+{
+ unsigned long res;
+
+ asm(".option push\n"
+ ".option arch,+zbc\n"
+ "clmulr %0, %1, %2\n"
+ ".option pop\n"
+ : "=r" (res) : "r" (a), "r" (b));
+ return res;
+}
+
+/*
+ * crc_load_long() loads one "unsigned long" of aligned data bytes, producing a
+ * polynomial whose bit order matches the CRC's bit order.
+ */
+#ifdef CONFIG_64BIT
+# if LSB_CRC
+# define crc_load_long(x) le64_to_cpup(x)
+# else
+# define crc_load_long(x) be64_to_cpup(x)
+# endif
+#else
+# if LSB_CRC
+# define crc_load_long(x) le32_to_cpup(x)
+# else
+# define crc_load_long(x) be32_to_cpup(x)
+# endif
+#endif
+
+/* XOR @crc into the end of @msgpoly that represents the high-order terms. */
+static inline unsigned long
+crc_clmul_prep(crc_t crc, unsigned long msgpoly)
+{
+#if LSB_CRC
+ return msgpoly ^ crc;
+#else
+ return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS));
+#endif
+}
+
+/*
+ * Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it
+ * modulo the generator polynomial G. This gives the CRC of @msgpoly.
+ */
+static inline crc_t
+crc_clmul_long(unsigned long msgpoly, const struct crc_clmul_consts *consts)
+{
+ unsigned long tmp;
+
+ /*
+ * First step of Barrett reduction with integrated multiplication by
+ * x^n: calculate floor((msgpoly * x^n) / G). This is the value by
+ * which G needs to be multiplied to cancel out the x^n and higher terms
+ * of msgpoly * x^n. Do it using the following formula:
+ *
+ * msb-first:
+ * floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1))
+ * lsb-first:
+ * floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG)
+ *
+ * barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G),
+ * which fits a long exactly. Using any lower power of x there would
+ * not carry enough precision through the calculation, while using any
+ * higher power of x would require extra instructions to handle a wider
+ * multiplication. In the msb-first case, using this power of x results
+ * in needing a floored division by x^(BITS_PER_LONG-1), which matches
+ * what clmulr produces. In the lsb-first case, a factor of x gets
+ * implicitly introduced by each carryless multiplication (shown as
+ * '* x' above), and the floored division instead needs to be by
+ * x^BITS_PER_LONG which matches what clmul produces.
+ */
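+	/*
+	 * For example, for the msb-first CRC-16 on a 64-bit CPU (see
+	 * crc-clmul-consts.h), n = 16 and barrett_reduction_const_1 holds
+	 * floor(x^(63+16) / G) = floor(x^79 / G).
+	 */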
+#if LSB_CRC
+ tmp = clmul(msgpoly, consts->barrett_reduction_const_1);
+#else
+ tmp = clmulr(msgpoly, consts->barrett_reduction_const_1);
+#endif
+
+ /*
+ * Second step of Barrett reduction:
+ *
+ * crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G))
+ *
+ * This reduces (msgpoly * x^n) modulo G by adding the appropriate
+ * multiple of G to it. The result uses only the x^0..x^(n-1) terms.
+ * HOWEVER, since the unreduced value (msgpoly * x^n) is zero in those
+ * terms in the first place, it is more efficient to do the equivalent:
+ *
+ * crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n
+ *
+ * In the lsb-first case further modify it to the following which avoids
+ * a shift, as the crc ends up in the physically low n bits from clmulr:
+ *
+ * product := ((G - x^n) * x^(BITS_PER_LONG - n)) * floor((msgpoly * x^n) / G) * x
+ * crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n
+ *
+ * barrett_reduction_const_2 contains the constant multiplier (G - x^n)
+ * or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above. The
+ * cast of the result to crc_t is essential, as it applies the mod x^n!
+ */
+#if LSB_CRC
+ return clmulr(tmp, consts->barrett_reduction_const_2);
+#else
+ return clmul(tmp, consts->barrett_reduction_const_2);
+#endif
+}
+
+/* Update @crc with the data from @msgpoly. */
+static inline crc_t
+crc_clmul_update_long(crc_t crc, unsigned long msgpoly,
+ const struct crc_clmul_consts *consts)
+{
+ return crc_clmul_long(crc_clmul_prep(crc, msgpoly), consts);
+}
+
+/* Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data. */
+static inline crc_t
+crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len,
+ const struct crc_clmul_consts *consts)
+{
+ unsigned long msgpoly;
+ size_t i;
+
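+	/*
+	 * Gather the @len data bytes into a message polynomial whose bit
+	 * order matches the CRC's bit order.
+	 */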
+#if LSB_CRC
+ msgpoly = (unsigned long)p[0] << (BITS_PER_LONG - 8);
+ for (i = 1; i < len; i++)
+ msgpoly = (msgpoly >> 8) ^ ((unsigned long)p[i] << (BITS_PER_LONG - 8));
+#else
+ msgpoly = p[0];
+ for (i = 1; i < len; i++)
+ msgpoly = (msgpoly << 8) ^ p[i];
+#endif
+
+ if (len >= sizeof(crc_t)) {
+ #if LSB_CRC
+ msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
+ #else
+ msgpoly ^= (unsigned long)crc << (8*len - CRC_BITS);
+ #endif
+ return crc_clmul_long(msgpoly, consts);
+ }
+#if LSB_CRC
+ msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
+ return crc_clmul_long(msgpoly, consts) ^ (crc >> (8*len));
+#else
+ msgpoly ^= crc >> (CRC_BITS - 8*len);
+ return crc_clmul_long(msgpoly, consts) ^ (crc << (8*len));
+#endif
+}
+
+static inline crc_t
+crc_clmul(crc_t crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts)
+{
+ size_t align;
+
+ /* This implementation assumes that the CRC fits in an unsigned long. */
+ BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long));
+
+ /* If the buffer is not long-aligned, align it. */
+ align = (unsigned long)p % sizeof(unsigned long);
+ if (align && len) {
+ align = min(sizeof(unsigned long) - align, len);
+ crc = crc_clmul_update_partial(crc, p, align, consts);
+ p += align;
+ len -= align;
+ }
+
+ if (len >= 4 * sizeof(unsigned long)) {
+ unsigned long m0, m1;
+
+ m0 = crc_clmul_prep(crc, crc_load_long(p));
+ m1 = crc_load_long(p + sizeof(unsigned long));
+ p += 2 * sizeof(unsigned long);
+ len -= 2 * sizeof(unsigned long);
+ /*
+ * Main loop. Each iteration starts with a message polynomial
+ * (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two
+ * more longs of data to form x^(3*BITS_PER_LONG)*m0 +
+ * x^(2*BITS_PER_LONG)*m1 + x^BITS_PER_LONG*m2 + m3, then
+ * "folds" that back into a congruent (modulo G) value that uses
+ * just m0 and m1 again. This is done by multiplying m0 by the
+ * precomputed constant (x^(3*BITS_PER_LONG) mod G) and m1 by
+ * the precomputed constant (x^(2*BITS_PER_LONG) mod G), then
+ * adding the results to m2 and m3 as appropriate. Each such
+ * multiplication produces a result twice the length of a long,
+ * which in RISC-V is two instructions clmul and clmulh.
+ *
+ * This could be changed to fold across more than 2 longs at a
+ * time if there is a CPU that can take advantage of it.
+ */
+ do {
+ unsigned long p0, p1, p2, p3;
+
+ p0 = clmulh(m0, consts->fold_across_2_longs_const_hi);
+ p1 = clmul(m0, consts->fold_across_2_longs_const_hi);
+ p2 = clmulh(m1, consts->fold_across_2_longs_const_lo);
+ p3 = clmul(m1, consts->fold_across_2_longs_const_lo);
+ m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p);
+ m1 = (LSB_CRC ? p0 ^ p2 : p1 ^ p3) ^
+ crc_load_long(p + sizeof(unsigned long));
+
+ p += 2 * sizeof(unsigned long);
+ len -= 2 * sizeof(unsigned long);
+ } while (len >= 2 * sizeof(unsigned long));
+
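+		/* Fold the final two message longs m0 and m1 into the CRC. */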
+ crc = crc_clmul_long(m0, consts);
+ crc = crc_clmul_update_long(crc, m1, consts);
+ }
+
+ while (len >= sizeof(unsigned long)) {
+ crc = crc_clmul_update_long(crc, crc_load_long(p), consts);
+ p += sizeof(unsigned long);
+ len -= sizeof(unsigned long);
+ }
+
+ if (len)
+ crc = crc_clmul_update_partial(crc, p, len, consts);
+
+ return crc;
+}
diff --git a/lib/crc/riscv/crc-clmul.h b/lib/crc/riscv/crc-clmul.h
new file mode 100644
index 000000000000..dd1736245815
--- /dev/null
+++ b/lib/crc/riscv/crc-clmul.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright 2025 Google LLC */
+
+#ifndef _RISCV_CRC_CLMUL_H
+#define _RISCV_CRC_CLMUL_H
+
+#include <linux/types.h>
+#include "crc-clmul-consts.h"
+
+u16 crc16_msb_clmul(u16 crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts);
+u32 crc32_msb_clmul(u32 crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts);
+u32 crc32_lsb_clmul(u32 crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts);
+#ifdef CONFIG_64BIT
+u64 crc64_msb_clmul(u64 crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts);
+u64 crc64_lsb_clmul(u64 crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts);
+#endif
+
+#endif /* _RISCV_CRC_CLMUL_H */
diff --git a/lib/crc/riscv/crc-t10dif.h b/lib/crc/riscv/crc-t10dif.h
new file mode 100644
index 000000000000..cd6136cbfda1
--- /dev/null
+++ b/lib/crc/riscv/crc-t10dif.h
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RISC-V optimized CRC-T10DIF function
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include <asm/hwcap.h>
+#include <asm/alternative-macros.h>
+
+#include "crc-clmul.h"
+
+static inline u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len)
+{
+ if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
+ return crc16_msb_clmul(crc, p, len, &crc16_msb_0x8bb7_consts);
+ return crc_t10dif_generic(crc, p, len);
+}
diff --git a/lib/crc/riscv/crc16_msb.c b/lib/crc/riscv/crc16_msb.c
new file mode 100644
index 000000000000..554d295e95f5
--- /dev/null
+++ b/lib/crc/riscv/crc16_msb.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RISC-V optimized most-significant-bit-first CRC16
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include "crc-clmul.h"
+
+typedef u16 crc_t;
+#define LSB_CRC 0
+#include "crc-clmul-template.h"
+
+u16 crc16_msb_clmul(u16 crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts)
+{
+ return crc_clmul(crc, p, len, consts);
+}
diff --git a/lib/crc/riscv/crc32.h b/lib/crc/riscv/crc32.h
new file mode 100644
index 000000000000..3ec6eee98afa
--- /dev/null
+++ b/lib/crc/riscv/crc32.h
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RISC-V optimized CRC32 functions
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include <asm/hwcap.h>
+#include <asm/alternative-macros.h>
+
+#include "crc-clmul.h"
+
+static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
+ return crc32_lsb_clmul(crc, p, len,
+ &crc32_lsb_0xedb88320_consts);
+ return crc32_le_base(crc, p, len);
+}
+
+static inline u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
+ return crc32_msb_clmul(crc, p, len,
+ &crc32_msb_0x04c11db7_consts);
+ return crc32_be_base(crc, p, len);
+}
+
+static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
+ return crc32_lsb_clmul(crc, p, len,
+ &crc32_lsb_0x82f63b78_consts);
+ return crc32c_base(crc, p, len);
+}
+
+static inline u32 crc32_optimizations_arch(void)
+{
+ if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
+ return CRC32_LE_OPTIMIZATION |
+ CRC32_BE_OPTIMIZATION |
+ CRC32C_OPTIMIZATION;
+ return 0;
+}
diff --git a/lib/crc/riscv/crc32_lsb.c b/lib/crc/riscv/crc32_lsb.c
new file mode 100644
index 000000000000..72fd67e7470c
--- /dev/null
+++ b/lib/crc/riscv/crc32_lsb.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RISC-V optimized least-significant-bit-first CRC32
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include "crc-clmul.h"
+
+typedef u32 crc_t;
+#define LSB_CRC 1
+#include "crc-clmul-template.h"
+
+u32 crc32_lsb_clmul(u32 crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts)
+{
+ return crc_clmul(crc, p, len, consts);
+}
diff --git a/lib/crc/riscv/crc32_msb.c b/lib/crc/riscv/crc32_msb.c
new file mode 100644
index 000000000000..fdbeaccc369f
--- /dev/null
+++ b/lib/crc/riscv/crc32_msb.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RISC-V optimized most-significant-bit-first CRC32
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include "crc-clmul.h"
+
+typedef u32 crc_t;
+#define LSB_CRC 0
+#include "crc-clmul-template.h"
+
+u32 crc32_msb_clmul(u32 crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts)
+{
+ return crc_clmul(crc, p, len, consts);
+}
diff --git a/lib/crc/riscv/crc64.h b/lib/crc/riscv/crc64.h
new file mode 100644
index 000000000000..a1b7873fde57
--- /dev/null
+++ b/lib/crc/riscv/crc64.h
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RISC-V optimized CRC64 functions
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include <asm/hwcap.h>
+#include <asm/alternative-macros.h>
+
+#include "crc-clmul.h"
+
+static inline u64 crc64_be_arch(u64 crc, const u8 *p, size_t len)
+{
+ if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
+ return crc64_msb_clmul(crc, p, len,
+ &crc64_msb_0x42f0e1eba9ea3693_consts);
+ return crc64_be_generic(crc, p, len);
+}
+
+static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
+{
+ if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
+ return crc64_lsb_clmul(crc, p, len,
+ &crc64_lsb_0x9a6c9329ac4bc9b5_consts);
+ return crc64_nvme_generic(crc, p, len);
+}
diff --git a/lib/crc/riscv/crc64_lsb.c b/lib/crc/riscv/crc64_lsb.c
new file mode 100644
index 000000000000..c5371bb85d90
--- /dev/null
+++ b/lib/crc/riscv/crc64_lsb.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RISC-V optimized least-significant-bit-first CRC64
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include "crc-clmul.h"
+
+typedef u64 crc_t;
+#define LSB_CRC 1
+#include "crc-clmul-template.h"
+
+u64 crc64_lsb_clmul(u64 crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts)
+{
+ return crc_clmul(crc, p, len, consts);
+}
diff --git a/lib/crc/riscv/crc64_msb.c b/lib/crc/riscv/crc64_msb.c
new file mode 100644
index 000000000000..1925d1dbe225
--- /dev/null
+++ b/lib/crc/riscv/crc64_msb.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RISC-V optimized most-significant-bit-first CRC64
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include "crc-clmul.h"
+
+typedef u64 crc_t;
+#define LSB_CRC 0
+#include "crc-clmul-template.h"
+
+u64 crc64_msb_clmul(u64 crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts)
+{
+ return crc_clmul(crc, p, len, consts);
+}
diff --git a/lib/crc/s390/crc32-vx.h b/lib/crc/s390/crc32-vx.h
new file mode 100644
index 000000000000..652c96e1a822
--- /dev/null
+++ b/lib/crc/s390/crc32-vx.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _CRC32_VX_S390_H
+#define _CRC32_VX_S390_H
+
+#include <linux/types.h>
+
+u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
+u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
+u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
+
+#endif /* _CRC32_VX_S390_H */
diff --git a/lib/crc/s390/crc32.h b/lib/crc/s390/crc32.h
new file mode 100644
index 000000000000..59c8983d428b
--- /dev/null
+++ b/lib/crc/s390/crc32.h
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * CRC-32 implemented with the z/Architecture Vector Extension Facility.
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ */
+
+#include <linux/cpufeature.h>
+#include <asm/fpu.h>
+#include "crc32-vx.h"
+
+#define VX_MIN_LEN 64
+#define VX_ALIGNMENT 16L
+#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
+
+/*
+ * DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension
+ *
+ * Creates a function to perform a particular CRC-32 computation. Depending
+ * on the message buffer size and the availability of the vector extension,
+ * either the hardware-accelerated or the software implementation is used.
+ * Note that the message buffer is aligned to improve fetch operations of
+ * VECTOR LOAD MULTIPLE instructions.
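+ *
+ * For example, DEFINE_CRC32_VX(crc32_le_arch, crc32_le_vgfm_16, crc32_le_base)
+ * below defines crc32_le_arch(), which runs crc32_le_vgfm_16() on the aligned
+ * bulk of the buffer and falls back to crc32_le_base() for the unaligned
+ * head, the remaining tail, short buffers, and machines without the vector
+ * extension.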
+ */
+#define DEFINE_CRC32_VX(___fname, ___crc32_vx, ___crc32_sw) \
+ static inline u32 ___fname(u32 crc, const u8 *data, size_t datalen) \
+ { \
+ unsigned long prealign, aligned, remaining; \
+ DECLARE_KERNEL_FPU_ONSTACK16(vxstate); \
+ \
+ if (datalen < VX_MIN_LEN + VX_ALIGN_MASK || !cpu_has_vx()) \
+ return ___crc32_sw(crc, data, datalen); \
+ \
+ if ((unsigned long)data & VX_ALIGN_MASK) { \
+ prealign = VX_ALIGNMENT - \
+ ((unsigned long)data & VX_ALIGN_MASK); \
+ datalen -= prealign; \
+ crc = ___crc32_sw(crc, data, prealign); \
+ data = (void *)((unsigned long)data + prealign); \
+ } \
+ \
+ aligned = datalen & ~VX_ALIGN_MASK; \
+ remaining = datalen & VX_ALIGN_MASK; \
+ \
+ kernel_fpu_begin(&vxstate, KERNEL_VXR_LOW); \
+ crc = ___crc32_vx(crc, data, aligned); \
+ kernel_fpu_end(&vxstate, KERNEL_VXR_LOW); \
+ \
+ if (remaining) \
+ crc = ___crc32_sw(crc, data + aligned, remaining); \
+ \
+ return crc; \
+ }
+
+DEFINE_CRC32_VX(crc32_le_arch, crc32_le_vgfm_16, crc32_le_base)
+DEFINE_CRC32_VX(crc32_be_arch, crc32_be_vgfm_16, crc32_be_base)
+DEFINE_CRC32_VX(crc32c_arch, crc32c_le_vgfm_16, crc32c_base)
+
+static inline u32 crc32_optimizations_arch(void)
+{
+ if (cpu_has_vx()) {
+ return CRC32_LE_OPTIMIZATION |
+ CRC32_BE_OPTIMIZATION |
+ CRC32C_OPTIMIZATION;
+ }
+ return 0;
+}
diff --git a/lib/crc/s390/crc32be-vx.c b/lib/crc/s390/crc32be-vx.c
new file mode 100644
index 000000000000..fed7c9c70d05
--- /dev/null
+++ b/lib/crc/s390/crc32be-vx.c
@@ -0,0 +1,174 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Hardware-accelerated CRC-32 variants for Linux on z Systems
+ *
+ * Use the z/Architecture Vector Extension Facility to accelerate the
+ * computing of CRC-32 checksums.
+ *
+ * This CRC-32 implementation algorithm processes the most-significant
+ * bit first (BE).
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <asm/fpu.h>
+#include "crc32-vx.h"
+
+/* Vector register range containing CRC-32 constants */
+#define CONST_R1R2 9
+#define CONST_R3R4 10
+#define CONST_R5 11
+#define CONST_R6 12
+#define CONST_RU_POLY 13
+#define CONST_CRC_POLY 14
+
+/*
+ * The CRC-32 constant block contains reduction constants to fold and
+ * process particular chunks of the input data stream in parallel.
+ *
+ * For the CRC-32 variants, the constants are precomputed according to
+ * these definitions:
+ *
+ * R1 = x^(4*128+64) mod P(x)
+ * R2 = x^(4*128) mod P(x)
+ * R3 = x^(128+64) mod P(x)
+ * R4 = x^128 mod P(x)
+ * R5 = x^96 mod P(x)
+ * R6 = x^64 mod P(x)
+ *
+ * The Barrett reduction constant, u, is defined as floor(x^64 / P(x)),
+ *
+ * where P(x) is the polynomial in the normal domain and P'(x) is the
+ * polynomial in the reversed (bitreflected) domain.
+ *
+ * Note that the constant definitions below are extended in order to compute
+ * intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
+ * The rightmost doubleword can be 0 to prevent contribution to the result or
+ * can be multiplied by 1 to perform an XOR without the need for a separate
+ * VECTOR EXCLUSIVE OR instruction.
+ *
+ * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
+ *
+ * P(x) = 0x04C11DB7
+ * P'(x) = 0xEDB88320
+ */
+
+static unsigned long constants_CRC_32_BE[] = {
+ 0x08833794c, 0x0e6228b11, /* R1, R2 */
+ 0x0c5b9cd4c, 0x0e8a45605, /* R3, R4 */
+ 0x0f200aa66, 1UL << 32, /* R5, x^32 */
+ 0x0490d678d, 1, /* R6, 1 */
+ 0x104d101df, 0, /* u */
+ 0x104C11DB7, 0, /* P(x) */
+};
+
+/**
+ * crc32_be_vgfm_16 - Compute CRC-32 (BE variant) with vector registers
+ * @crc: Initial CRC value, typically ~0.
+ * @buf: Input buffer pointer, performance might be improved if the
+ * buffer is on a doubleword boundary.
+ * @size: Size of the buffer, must be 64 bytes or greater.
+ *
+ * Register usage:
+ * V0: Initial CRC value and intermediate constants and results.
+ * V1..V4: Data for CRC computation.
+ * V5..V8: Next data chunks that are fetched from the input buffer.
+ * V9..V14: CRC-32 constants.
+ */
+u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
+{
+ /* Load CRC-32 constants */
+ fpu_vlm(CONST_R1R2, CONST_CRC_POLY, &constants_CRC_32_BE);
+ fpu_vzero(0);
+
+ /* Load the initial CRC value into the leftmost word of V0. */
+ fpu_vlvgf(0, crc, 0);
+
+ /* Load a 64-byte data chunk and XOR with CRC */
+ fpu_vlm(1, 4, buf);
+ fpu_vx(1, 0, 1);
+ buf += 64;
+ size -= 64;
+
+ while (size >= 64) {
+ /* Load the next 64-byte data chunk into V5 to V8 */
+ fpu_vlm(5, 8, buf);
+
+ /*
+ * Perform a GF(2) multiplication of the doublewords in V1 with
+ * the reduction constants in V0. The intermediate result is
+ * then folded (accumulated) with the next data chunk in V5 and
+ * stored in V1. Repeat this step for the register contents
+ * in V2, V3, and V4 respectively.
+ */
+ fpu_vgfmag(1, CONST_R1R2, 1, 5);
+ fpu_vgfmag(2, CONST_R1R2, 2, 6);
+ fpu_vgfmag(3, CONST_R1R2, 3, 7);
+ fpu_vgfmag(4, CONST_R1R2, 4, 8);
+ buf += 64;
+ size -= 64;
+ }
+
+ /* Fold V1 to V4 into a single 128-bit value in V1 */
+ fpu_vgfmag(1, CONST_R3R4, 1, 2);
+ fpu_vgfmag(1, CONST_R3R4, 1, 3);
+ fpu_vgfmag(1, CONST_R3R4, 1, 4);
+
+ while (size >= 16) {
+ fpu_vl(2, buf);
+ fpu_vgfmag(1, CONST_R3R4, 1, 2);
+ buf += 16;
+ size -= 16;
+ }
+
+ /*
+	 * The R5 constant is used to fold a 128-bit value into a 96-bit value
+	 * that is XORed with the next 96-bit input data chunk. To use a single
+	 * VGFMG instruction, multiply the rightmost 64 bits by x^32 (1 << 32) to
+	 * form an intermediate 96-bit value (with appended zeros) which is then
+	 * XORed with the intermediate reduction result.
+ */
+ fpu_vgfmg(1, CONST_R5, 1);
+
+ /*
+	 * Further reduce the remaining 96-bit value to a 64-bit value using a
+	 * single VGFMG: the rightmost doubleword is multiplied by 0x1. The
+	 * intermediate result is then XORed with the product of the leftmost
+	 * doubleword with R6. The result is a 64-bit value and is subject to
+	 * the Barrett reduction.
+ */
+ fpu_vgfmg(1, CONST_R6, 1);
+
+ /*
+	 * The input values to the Barrett reduction are the degree-63
+	 * polynomial in V1 (R(x)), the degree-32 generator polynomial, and the
+	 * reduction constant u. The Barrett reduction result is the CRC value
+	 * of R(x) mod P(x).
+	 *
+	 * The Barrett reduction algorithm is defined as:
+	 *
+	 *	1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+	 *	2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+	 *	3. C(x) = R(x) XOR T2(x) mod x^32
+	 *
+	 * Note: To compensate for the division by x^32, use the vector unpack
+	 * instruction to move the leftmost word into the leftmost doubleword
+	 * of the vector register. The rightmost doubleword is multiplied
+	 * with zero so that it does not contribute to the intermediate results.
+ */
+
+ /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
+ fpu_vupllf(2, 1);
+ fpu_vgfmg(2, CONST_RU_POLY, 2);
+
+ /*
+	 * Compute the GF(2) product of the CRC polynomial with T1(x) in
+ * V2 and XOR the intermediate result, T2(x), with the value in V1.
+ * The final result is in the rightmost word of V2.
+ */
+ fpu_vupllf(2, 2);
+ fpu_vgfmag(2, CONST_CRC_POLY, 2, 1);
+ return fpu_vlgvf(2, 3);
+}
diff --git a/lib/crc/s390/crc32le-vx.c b/lib/crc/s390/crc32le-vx.c
new file mode 100644
index 000000000000..2f629f394df7
--- /dev/null
+++ b/lib/crc/s390/crc32le-vx.c
@@ -0,0 +1,240 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Hardware-accelerated CRC-32 variants for Linux on z Systems
+ *
+ * Use the z/Architecture Vector Extension Facility to accelerate the
+ * computing of bitreflected CRC-32 checksums for IEEE 802.3 Ethernet
+ * and Castagnoli.
+ *
+ * This CRC-32 implementation algorithm is bitreflected and processes
+ * the least-significant bit first (Little-Endian).
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <asm/fpu.h>
+#include "crc32-vx.h"
+
+/* Vector register range containing CRC-32 constants */
+#define CONST_PERM_LE2BE 9
+#define CONST_R2R1 10
+#define CONST_R4R3 11
+#define CONST_R5 12
+#define CONST_RU_POLY 13
+#define CONST_CRC_POLY 14
+
+/*
+ * The CRC-32 constant block contains reduction constants to fold and
+ * process particular chunks of the input data stream in parallel.
+ *
+ * For the CRC-32 variants, the constants are precomputed according to
+ * these definitions:
+ *
+ * R1 = [(x^(4*128+32) mod P'(x) << 32)]' << 1
+ * R2 = [(x^(4*128-32) mod P'(x) << 32)]' << 1
+ * R3 = [(x^(128+32) mod P'(x) << 32)]' << 1
+ * R4 = [(x^(128-32) mod P'(x) << 32)]' << 1
+ * R5 = [(x^64 mod P'(x) << 32)]' << 1
+ * R6 = [(x^32 mod P'(x) << 32)]' << 1
+ *
+ * The bitreflected Barrett reduction constant, u', is defined as
+ * the bit reversal of floor(x^64 / P(x)),
+ *
+ * where P(x) is the polynomial in the normal domain and P'(x) is the
+ * polynomial in the reversed (bitreflected) domain.
+ *
+ * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
+ *
+ * P(x) = 0x04C11DB7
+ * P'(x) = 0xEDB88320
+ *
+ * CRC-32C (Castagnoli) polynomials:
+ *
+ * P(x) = 0x1EDC6F41
+ * P'(x) = 0x82F63B78
+ */
+
+static unsigned long constants_CRC_32_LE[] = {
+ 0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask */
+ 0x1c6e41596, 0x154442bd4, /* R2, R1 */
+ 0x0ccaa009e, 0x1751997d0, /* R4, R3 */
+ 0x0, 0x163cd6124, /* R5 */
+ 0x0, 0x1f7011641, /* u' */
+ 0x0, 0x1db710641 /* P'(x) << 1 */
+};
+
+static unsigned long constants_CRC_32C_LE[] = {
+ 0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask */
+ 0x09e4addf8, 0x740eef02, /* R2, R1 */
+ 0x14cd00bd6, 0xf20c0dfe, /* R4, R3 */
+ 0x0, 0x0dd45aab8, /* R5 */
+ 0x0, 0x0dea713f1, /* u' */
+ 0x0, 0x105ec76f0 /* P'(x) << 1 */
+};
+
+/**
+ * crc32_le_vgfm_generic - Compute CRC-32 (LE variant) with vector registers
+ * @crc: Initial CRC value, typically ~0.
+ * @buf: Input buffer pointer, performance might be improved if the
+ * buffer is on a doubleword boundary.
+ * @size: Size of the buffer, must be 64 bytes or greater.
+ * @constants: CRC-32 constant pool base pointer.
+ *
+ * Register usage:
+ * V0: Initial CRC value and intermediate constants and results.
+ * V1..V4: Data for CRC computation.
+ * V5..V8: Next data chunks that are fetched from the input buffer.
+ * V9: Constant for BE->LE conversion and shift operations
+ * V10..V14: CRC-32 constants.
+ */
+static u32 crc32_le_vgfm_generic(u32 crc, unsigned char const *buf,
+				 size_t size, unsigned long *constants)
+{
+ /* Load CRC-32 constants */
+ fpu_vlm(CONST_PERM_LE2BE, CONST_CRC_POLY, constants);
+
+ /*
+ * Load the initial CRC value.
+ *
+ * The CRC value is loaded into the rightmost word of the
+ * vector register and is later XORed with the LSB portion
+ * of the loaded input data.
+ */
+ fpu_vzero(0); /* Clear V0 */
+ fpu_vlvgf(0, crc, 3); /* Load CRC into rightmost word */
+
+ /* Load a 64-byte data chunk and XOR with CRC */
+ fpu_vlm(1, 4, buf);
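+	/* Byte-reverse each 16-byte chunk (CONST_PERM_LE2BE) for the VGFM ops */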
+ fpu_vperm(1, 1, 1, CONST_PERM_LE2BE);
+ fpu_vperm(2, 2, 2, CONST_PERM_LE2BE);
+ fpu_vperm(3, 3, 3, CONST_PERM_LE2BE);
+ fpu_vperm(4, 4, 4, CONST_PERM_LE2BE);
+
+ fpu_vx(1, 0, 1); /* V1 ^= CRC */
+ buf += 64;
+ size -= 64;
+
+ while (size >= 64) {
+ fpu_vlm(5, 8, buf);
+ fpu_vperm(5, 5, 5, CONST_PERM_LE2BE);
+ fpu_vperm(6, 6, 6, CONST_PERM_LE2BE);
+ fpu_vperm(7, 7, 7, CONST_PERM_LE2BE);
+ fpu_vperm(8, 8, 8, CONST_PERM_LE2BE);
+ /*
+ * Perform a GF(2) multiplication of the doublewords in V1 with
+ * the R1 and R2 reduction constants in V0. The intermediate
+ * result is then folded (accumulated) with the next data chunk
+ * in V5 and stored in V1. Repeat this step for the register
+ * contents in V2, V3, and V4 respectively.
+ */
+ fpu_vgfmag(1, CONST_R2R1, 1, 5);
+ fpu_vgfmag(2, CONST_R2R1, 2, 6);
+ fpu_vgfmag(3, CONST_R2R1, 3, 7);
+ fpu_vgfmag(4, CONST_R2R1, 4, 8);
+ buf += 64;
+ size -= 64;
+ }
+
+ /*
+	 * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3
+	 * and R4 and accumulate the next 128-bit chunk until a single 128-bit
+	 * value remains.
+ */
+ fpu_vgfmag(1, CONST_R4R3, 1, 2);
+ fpu_vgfmag(1, CONST_R4R3, 1, 3);
+ fpu_vgfmag(1, CONST_R4R3, 1, 4);
+
+ while (size >= 16) {
+ fpu_vl(2, buf);
+ fpu_vperm(2, 2, 2, CONST_PERM_LE2BE);
+ fpu_vgfmag(1, CONST_R4R3, 1, 2);
+ buf += 16;
+ size -= 16;
+ }
+
+ /*
+ * Set up a vector register for byte shifts. The shift value must
+ * be loaded in bits 1-4 in byte element 7 of a vector register.
+ * Shift by 8 bytes: 0x40
+ * Shift by 4 bytes: 0x20
+ */
+ fpu_vleib(9, 0x40, 7);
+
+ /*
+ * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
+ * to move R4 into the rightmost doubleword and set the leftmost
+ * doubleword to 0x1.
+ */
+ fpu_vsrlb(0, CONST_R4R3, 9);
+ fpu_vleig(0, 1, 0);
+
+ /*
+ * Compute GF(2) product of V1 and V0. The rightmost doubleword
+ * of V1 is multiplied with R4. The leftmost doubleword of V1 is
+ * multiplied by 0x1 and is then XORed with the rightmost product.
+ * Implicitly, the intermediate leftmost product becomes padded with
+ * zeroes.
+ */
+ fpu_vgfmg(1, 0, 1);
+
+ /*
+ * Now do the final 32-bit fold by multiplying the rightmost word
+ * in V1 with R5 and XOR the result with the remaining bits in V1.
+ *
+ * To achieve this by a single VGFMAG, right shift V1 by a word
+ * and store the result in V2 which is then accumulated. Use the
+ * vector unpack instruction to load the rightmost half of the
+ * doubleword into the rightmost doubleword element of V1; the other
+ * half is loaded in the leftmost doubleword.
+ * The vector register with CONST_R5 contains the R5 constant in the
+ * rightmost doubleword and the leftmost doubleword is zero to ignore
+ * the leftmost product of V1.
+ */
+ fpu_vleib(9, 0x20, 7); /* Shift by words */
+ fpu_vsrlb(2, 1, 9); /* Store remaining bits in V2 */
+ fpu_vupllf(1, 1); /* Split rightmost doubleword */
+ fpu_vgfmag(1, CONST_R5, 1, 2); /* V1 = (V1 * R5) XOR V2 */
+
+ /*
+ * Apply a Barrett reduction to compute the final 32-bit CRC value.
+ *
+ * The input values to the Barrett reduction are the degree-63
+ * polynomial in V1 (R(x)), the degree-32 generator polynomial, and the
+ * reduction constant u. The Barrett reduction result is the CRC value
+ * of R(x) mod P(x).
+ *
+ * The Barrett reduction algorithm is defined as:
+ *
+ * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+ * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+ * 3. C(x) = R(x) XOR T2(x) mod x^32
+ *
+ * Note: The leftmost doubleword of the vector register containing
+ * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
+ * is zero and does not contribute to the final result.
+ */
+
+ /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
+ fpu_vupllf(2, 1);
+ fpu_vgfmg(2, CONST_RU_POLY, 2);
+
+ /*
+ * Compute the GF(2) product of the CRC polynomial with T1(x) in
+ * V2 and XOR the intermediate result, T2(x), with the value in V1.
+ * The final result is stored in word element 2 of V2.
+ */
+ fpu_vupllf(2, 2);
+ fpu_vgfmag(2, CONST_CRC_POLY, 2, 1);
+
+ return fpu_vlgvf(2, 2);
+}
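+
+/*
+ * Purely illustrative scalar model of Barrett steps 1-3 above (msb-first
+ * convention, bit i = x^i); it is not used by the vector code, and the
+ * helper names are hypothetical. Here g is the full 33-bit generator
+ * polynomial and u is the precomputed quotient floor(x^64 / G(x)).
+ */
+static __maybe_unused u64 crc_clmul_sketch(u32 a, u64 b)
+{
+ u64 p = 0;
+ int i;
+
+ /* Carry-less (GF(2)) multiply of 32-bit a by (up to) 33-bit b */
+ for (i = 0; i <= 32; i++)
+ if (b & (1ULL << i))
+ p ^= (u64)a << i;
+ return p;
+}
+
+static __maybe_unused u32 crc_barrett_sketch(u64 r, u64 g, u64 u)
+{
+ u64 t1 = crc_clmul_sketch(r >> 32, u); /* 1. T1 = floor(R/x^32) GF2MUL u */
+ u64 t2 = crc_clmul_sketch(t1 >> 32, g); /* 2. T2 = floor(T1/x^32) GF2MUL P */
+
+ return (u32)(r ^ t2); /* 3. C = (R XOR T2) mod x^32 */
+}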
+
+u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
+{
+ return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32_LE[0]);
+}
+
+u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
+{
+ return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32C_LE[0]);
+}
diff --git a/lib/crc/sparc/crc32.h b/lib/crc/sparc/crc32.h
new file mode 100644
index 000000000000..df7c350acd7b
--- /dev/null
+++ b/lib/crc/sparc/crc32.h
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* CRC32c (Castagnoli), sparc64 crc32c opcode accelerated
+ *
+ * This is based largely upon arch/x86/crypto/crc32c-intel.c
+ *
+ * Copyright (C) 2008 Intel Corporation
+ * Authors: Austin Zhang <austin_zhang@linux.intel.com>
+ * Kent Liu <kent.liu@intel.com>
+ */
+
+#include <asm/pstate.h>
+#include <asm/elf.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32c_opcode);
+
+#define crc32_le_arch crc32_le_base /* not implemented on this arch */
+#define crc32_be_arch crc32_be_base /* not implemented on this arch */
+
+void crc32c_sparc64(u32 *crcp, const u64 *data, size_t len);
+
+static inline u32 crc32c_arch(u32 crc, const u8 *data, size_t len)
+{
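+ /* Bytes needed to reach the next 8-byte boundary (0 if already aligned) */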
+ size_t n = -(uintptr_t)data & 7;
+
+ if (!static_branch_likely(&have_crc32c_opcode))
+ return crc32c_base(crc, data, len);
+
+ if (n) {
+ /* Data isn't 8-byte aligned. Align it. */
+ n = min(n, len);
+ crc = crc32c_base(crc, data, n);
+ data += n;
+ len -= n;
+ }
+ n = len & ~7U;
+ if (n) {
+ crc32c_sparc64(&crc, (const u64 *)data, n);
+ data += n;
+ len -= n;
+ }
+ if (len)
+ crc = crc32c_base(crc, data, len);
+ return crc;
+}
+
+#define crc32_mod_init_arch crc32_mod_init_arch
+static void crc32_mod_init_arch(void)
+{
+ unsigned long cfr;
+
+ if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+ return;
+
+ __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+ if (!(cfr & CFR_CRC32C))
+ return;
+
+ static_branch_enable(&have_crc32c_opcode);
+ pr_info("Using sparc64 crc32c opcode optimized CRC32C implementation\n");
+}
+
+static inline u32 crc32_optimizations_arch(void)
+{
+ if (static_key_enabled(&have_crc32c_opcode))
+ return CRC32C_OPTIMIZATION;
+ return 0;
+}
diff --git a/lib/crc/sparc/crc32c_asm.S b/lib/crc/sparc/crc32c_asm.S
new file mode 100644
index 000000000000..4db873850f44
--- /dev/null
+++ b/lib/crc/sparc/crc32c_asm.S
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/opcodes.h>
+#include <asm/visasm.h>
+#include <asm/asi.h>
+
+ENTRY(crc32c_sparc64)
+ /* %o0=crc32p, %o1=data_ptr, %o2=len */
+ VISEntryHalf
+ lda [%o0] ASI_PL, %f1
+1: ldd [%o1], %f2
+ CRC32C(0,2,0)
+ subcc %o2, 8, %o2
+ bne,pt %icc, 1b
+ add %o1, 0x8, %o1
+ sta %f1, [%o0] ASI_PL
+ VISExitHalf
+2: retl
+ nop
+ENDPROC(crc32c_sparc64)
diff --git a/lib/crc/tests/Makefile b/lib/crc/tests/Makefile
new file mode 100644
index 000000000000..65f63c318ef5
--- /dev/null
+++ b/lib/crc/tests/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_CRC_KUNIT_TEST) += crc_kunit.o
diff --git a/lib/tests/crc_kunit.c b/lib/crc/tests/crc_kunit.c
index 064c2d581557..9a450e25ac81 100644
--- a/lib/tests/crc_kunit.c
+++ b/lib/crc/tests/crc_kunit.c
@@ -6,6 +6,7 @@
*
* Author: Eric Biggers <ebiggers@google.com>
*/
+#include <kunit/run-in-irq-context.h>
#include <kunit/test.h>
#include <linux/crc7.h>
#include <linux/crc16.h>
@@ -36,14 +37,12 @@ static size_t test_buflen;
* can fit any CRC up to CRC-64. The CRC is passed in, and is expected
* to be returned in, the least significant bits of the u64. The
* function is expected to *not* invert the CRC at the beginning and end.
- * @combine_func: Optional function to combine two CRCs.
*/
struct crc_variant {
int bits;
bool le;
u64 poly;
u64 (*func)(u64 crc, const u8 *p, size_t len);
- u64 (*combine_func)(u64 crc1, u64 crc2, size_t len2);
};
static u32 rand32(void)
@@ -143,15 +142,62 @@ static size_t generate_random_length(size_t max_length)
return len % (max_length + 1);
}
+#define IRQ_TEST_DATA_LEN 512
+#define IRQ_TEST_NUM_BUFFERS 3 /* matches max concurrency level */
+
+struct crc_irq_test_state {
+ const struct crc_variant *v;
+ u64 initial_crc;
+ u64 expected_crcs[IRQ_TEST_NUM_BUFFERS];
+ atomic_t seqno;
+};
+
+/*
+ * Compute the CRC of one of the test messages and verify that it matches the
+ * expected CRC from @state->expected_crcs. To increase the chance of detecting
+ * problems, cycle through multiple messages.
+ */
+static bool crc_irq_test_func(void *state_)
+{
+ struct crc_irq_test_state *state = state_;
+ const struct crc_variant *v = state->v;
+ u32 i = (u32)atomic_inc_return(&state->seqno) % IRQ_TEST_NUM_BUFFERS;
+ u64 actual_crc = v->func(state->initial_crc,
+ &test_buffer[i * IRQ_TEST_DATA_LEN],
+ IRQ_TEST_DATA_LEN);
+
+ return actual_crc == state->expected_crcs[i];
+}
+
+/*
+ * Test that if CRCs are computed in task, softirq, and hardirq context
+ * concurrently, then all results are as expected.
+ */
+static void crc_interrupt_context_test(struct kunit *test,
+ const struct crc_variant *v)
+{
+ struct crc_irq_test_state state = {
+ .v = v,
+ .initial_crc = generate_random_initial_crc(v),
+ };
+
+ for (int i = 0; i < IRQ_TEST_NUM_BUFFERS; i++) {
+ state.expected_crcs[i] = crc_ref(
+ v, state.initial_crc,
+ &test_buffer[i * IRQ_TEST_DATA_LEN], IRQ_TEST_DATA_LEN);
+ }
+
+ kunit_run_irq_test(test, crc_irq_test_func, 100000, &state);
+}
+
/* Test that v->func gives the same CRCs as a reference implementation. */
-static void crc_main_test(struct kunit *test, const struct crc_variant *v)
+static void crc_test(struct kunit *test, const struct crc_variant *v)
{
size_t i;
for (i = 0; i < CRC_KUNIT_NUM_TEST_ITERS; i++) {
u64 init_crc, expected_crc, actual_crc;
size_t len, offset;
- bool nosimd;
init_crc = generate_random_initial_crc(v);
len = generate_random_length(CRC_KUNIT_MAX_LEN);
@@ -170,51 +216,18 @@ static void crc_main_test(struct kunit *test, const struct crc_variant *v)
/* Refresh the data occasionally. */
prandom_bytes_state(&rng, &test_buffer[offset], len);
- nosimd = rand32() % 8 == 0;
-
/*
* Compute the CRC, and verify that it equals the CRC computed
* by a simple bit-at-a-time reference implementation.
*/
expected_crc = crc_ref(v, init_crc, &test_buffer[offset], len);
- if (nosimd)
- local_irq_disable();
actual_crc = v->func(init_crc, &test_buffer[offset], len);
- if (nosimd)
- local_irq_enable();
- KUNIT_EXPECT_EQ_MSG(test, expected_crc, actual_crc,
- "Wrong result with len=%zu offset=%zu nosimd=%d",
- len, offset, nosimd);
- }
-}
-
-/* Test that CRC(concat(A, B)) == combine_CRCs(CRC(A), CRC(B), len(B)). */
-static void crc_combine_test(struct kunit *test, const struct crc_variant *v)
-{
- int i;
-
- for (i = 0; i < 100; i++) {
- u64 init_crc = generate_random_initial_crc(v);
- size_t len1 = generate_random_length(CRC_KUNIT_MAX_LEN);
- size_t len2 = generate_random_length(CRC_KUNIT_MAX_LEN - len1);
- u64 crc1, crc2, expected_crc, actual_crc;
-
- prandom_bytes_state(&rng, test_buffer, len1 + len2);
- crc1 = v->func(init_crc, test_buffer, len1);
- crc2 = v->func(0, &test_buffer[len1], len2);
- expected_crc = v->func(init_crc, test_buffer, len1 + len2);
- actual_crc = v->combine_func(crc1, crc2, len2);
KUNIT_EXPECT_EQ_MSG(test, expected_crc, actual_crc,
- "CRC combination gave wrong result with len1=%zu len2=%zu\n",
- len1, len2);
+ "Wrong result with len=%zu offset=%zu",
+ len, offset);
}
-}
-static void crc_test(struct kunit *test, const struct crc_variant *v)
-{
- crc_main_test(test, v);
- if (v->combine_func)
- crc_combine_test(test, v);
+ crc_interrupt_context_test(test, v);
}
static __always_inline void
@@ -337,17 +350,11 @@ static u64 crc32_le_wrapper(u64 crc, const u8 *p, size_t len)
return crc32_le(crc, p, len);
}
-static u64 crc32_le_combine_wrapper(u64 crc1, u64 crc2, size_t len2)
-{
- return crc32_le_combine(crc1, crc2, len2);
-}
-
static const struct crc_variant crc_variant_crc32_le = {
.bits = 32,
.le = true,
.poly = 0xedb88320,
.func = crc32_le_wrapper,
- .combine_func = crc32_le_combine_wrapper,
};
static void crc32_le_test(struct kunit *test)
diff --git a/lib/crc/x86/crc-pclmul-consts.h b/lib/crc/x86/crc-pclmul-consts.h
new file mode 100644
index 000000000000..6ae94158fca2
--- /dev/null
+++ b/lib/crc/x86/crc-pclmul-consts.h
@@ -0,0 +1,240 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * CRC constants generated by:
+ *
+ * ./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc32_lsb_0x82f63b78,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
+ *
+ * Do not edit manually.
+ */
+
+/*
+ * CRC folding constants generated for most-significant-bit-first CRC-16 using
+ * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+ */
+static const struct {
+ u8 bswap_mask[16];
+ u64 fold_across_2048_bits_consts[2];
+ u64 fold_across_1024_bits_consts[2];
+ u64 fold_across_512_bits_consts[2];
+ u64 fold_across_256_bits_consts[2];
+ u64 fold_across_128_bits_consts[2];
+ u8 shuf_table[48];
+ u64 barrett_reduction_consts[2];
+} crc16_msb_0x8bb7_consts ____cacheline_aligned __maybe_unused = {
+ .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+ .fold_across_2048_bits_consts = {
+ 0xdccf000000000000, /* LO64_TERMS: (x^2000 mod G) * x^48 */
+ 0x4b0b000000000000, /* HI64_TERMS: (x^2064 mod G) * x^48 */
+ },
+ .fold_across_1024_bits_consts = {
+ 0x9d9d000000000000, /* LO64_TERMS: (x^976 mod G) * x^48 */
+ 0x7cf5000000000000, /* HI64_TERMS: (x^1040 mod G) * x^48 */
+ },
+ .fold_across_512_bits_consts = {
+ 0x044c000000000000, /* LO64_TERMS: (x^464 mod G) * x^48 */
+ 0xe658000000000000, /* HI64_TERMS: (x^528 mod G) * x^48 */
+ },
+ .fold_across_256_bits_consts = {
+ 0x6ee3000000000000, /* LO64_TERMS: (x^208 mod G) * x^48 */
+ 0xe7b5000000000000, /* HI64_TERMS: (x^272 mod G) * x^48 */
+ },
+ .fold_across_128_bits_consts = {
+ 0x2d56000000000000, /* LO64_TERMS: (x^80 mod G) * x^48 */
+ 0x06df000000000000, /* HI64_TERMS: (x^144 mod G) * x^48 */
+ },
+ .shuf_table = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ },
+ .barrett_reduction_consts = {
+ 0x8bb7000000000000, /* LO64_TERMS: (G - x^16) * x^48 */
+ 0xf65a57f81d33a48a, /* HI64_TERMS: (floor(x^79 / G) * x) - x^64 */
+ },
+};
+
+/*
+ * CRC folding constants generated for least-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
+ * x^5 + x^4 + x^2 + x^1 + x^0
+ */
+static const struct {
+ u64 fold_across_2048_bits_consts[2];
+ u64 fold_across_1024_bits_consts[2];
+ u64 fold_across_512_bits_consts[2];
+ u64 fold_across_256_bits_consts[2];
+ u64 fold_across_128_bits_consts[2];
+ u8 shuf_table[48];
+ u64 barrett_reduction_consts[2];
+} crc32_lsb_0xedb88320_consts ____cacheline_aligned __maybe_unused = {
+ .fold_across_2048_bits_consts = {
+ 0x00000000ce3371cb, /* HI64_TERMS: (x^2079 mod G) * x^32 */
+ 0x00000000e95c1271, /* LO64_TERMS: (x^2015 mod G) * x^32 */
+ },
+ .fold_across_1024_bits_consts = {
+ 0x0000000033fff533, /* HI64_TERMS: (x^1055 mod G) * x^32 */
+ 0x00000000910eeec1, /* LO64_TERMS: (x^991 mod G) * x^32 */
+ },
+ .fold_across_512_bits_consts = {
+ 0x000000008f352d95, /* HI64_TERMS: (x^543 mod G) * x^32 */
+ 0x000000001d9513d7, /* LO64_TERMS: (x^479 mod G) * x^32 */
+ },
+ .fold_across_256_bits_consts = {
+ 0x00000000f1da05aa, /* HI64_TERMS: (x^287 mod G) * x^32 */
+ 0x0000000081256527, /* LO64_TERMS: (x^223 mod G) * x^32 */
+ },
+ .fold_across_128_bits_consts = {
+ 0x00000000ae689191, /* HI64_TERMS: (x^159 mod G) * x^32 */
+ 0x00000000ccaa009e, /* LO64_TERMS: (x^95 mod G) * x^32 */
+ },
+ .shuf_table = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ },
+ .barrett_reduction_consts = {
+ 0xb4e5b025f7011641, /* HI64_TERMS: floor(x^95 / G) */
+ 0x00000001db710640, /* LO64_TERMS: (G - x^32) * x^31 */
+ },
+};
+
+/*
+ * CRC folding constants generated for least-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^28 + x^27 + x^26 + x^25 + x^23 + x^22 + x^20 + x^19 + x^18 +
+ * x^14 + x^13 + x^11 + x^10 + x^9 + x^8 + x^6 + x^0
+ */
+static const struct {
+ u64 fold_across_2048_bits_consts[2];
+ u64 fold_across_1024_bits_consts[2];
+ u64 fold_across_512_bits_consts[2];
+ u64 fold_across_256_bits_consts[2];
+ u64 fold_across_128_bits_consts[2];
+ u8 shuf_table[48];
+ u64 barrett_reduction_consts[2];
+} crc32_lsb_0x82f63b78_consts ____cacheline_aligned __maybe_unused = {
+ .fold_across_2048_bits_consts = {
+ 0x00000000dcb17aa4, /* HI64_TERMS: (x^2079 mod G) * x^32 */
+ 0x00000000b9e02b86, /* LO64_TERMS: (x^2015 mod G) * x^32 */
+ },
+ .fold_across_1024_bits_consts = {
+ 0x000000006992cea2, /* HI64_TERMS: (x^1055 mod G) * x^32 */
+ 0x000000000d3b6092, /* LO64_TERMS: (x^991 mod G) * x^32 */
+ },
+ .fold_across_512_bits_consts = {
+ 0x00000000740eef02, /* HI64_TERMS: (x^543 mod G) * x^32 */
+ 0x000000009e4addf8, /* LO64_TERMS: (x^479 mod G) * x^32 */
+ },
+ .fold_across_256_bits_consts = {
+ 0x000000003da6d0cb, /* HI64_TERMS: (x^287 mod G) * x^32 */
+ 0x00000000ba4fc28e, /* LO64_TERMS: (x^223 mod G) * x^32 */
+ },
+ .fold_across_128_bits_consts = {
+ 0x00000000f20c0dfe, /* HI64_TERMS: (x^159 mod G) * x^32 */
+ 0x00000000493c7d27, /* LO64_TERMS: (x^95 mod G) * x^32 */
+ },
+ .shuf_table = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ },
+ .barrett_reduction_consts = {
+ 0x4869ec38dea713f1, /* HI64_TERMS: floor(x^95 / G) */
+ 0x0000000105ec76f0, /* LO64_TERMS: (G - x^32) * x^31 */
+ },
+};
+
+/*
+ * CRC folding constants generated for most-significant-bit-first CRC-64 using
+ * G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
+ * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
+ * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
+ * x^7 + x^4 + x^1 + x^0
+ */
+static const struct {
+ u8 bswap_mask[16];
+ u64 fold_across_2048_bits_consts[2];
+ u64 fold_across_1024_bits_consts[2];
+ u64 fold_across_512_bits_consts[2];
+ u64 fold_across_256_bits_consts[2];
+ u64 fold_across_128_bits_consts[2];
+ u8 shuf_table[48];
+ u64 barrett_reduction_consts[2];
+} crc64_msb_0x42f0e1eba9ea3693_consts ____cacheline_aligned __maybe_unused = {
+ .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+ .fold_across_2048_bits_consts = {
+ 0x7f52691a60ddc70d, /* LO64_TERMS: (x^2048 mod G) * x^0 */
+ 0x7036b0389f6a0c82, /* HI64_TERMS: (x^2112 mod G) * x^0 */
+ },
+ .fold_across_1024_bits_consts = {
+ 0x05cf79dea9ac37d6, /* LO64_TERMS: (x^1024 mod G) * x^0 */
+ 0x001067e571d7d5c2, /* HI64_TERMS: (x^1088 mod G) * x^0 */
+ },
+ .fold_across_512_bits_consts = {
+ 0x5f6843ca540df020, /* LO64_TERMS: (x^512 mod G) * x^0 */
+ 0xddf4b6981205b83f, /* HI64_TERMS: (x^576 mod G) * x^0 */
+ },
+ .fold_across_256_bits_consts = {
+ 0x571bee0a227ef92b, /* LO64_TERMS: (x^256 mod G) * x^0 */
+ 0x44bef2a201b5200c, /* HI64_TERMS: (x^320 mod G) * x^0 */
+ },
+ .fold_across_128_bits_consts = {
+ 0x05f5c3c7eb52fab6, /* LO64_TERMS: (x^128 mod G) * x^0 */
+ 0x4eb938a7d257740e, /* HI64_TERMS: (x^192 mod G) * x^0 */
+ },
+ .shuf_table = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ },
+ .barrett_reduction_consts = {
+ 0x42f0e1eba9ea3693, /* LO64_TERMS: (G - x^64) * x^0 */
+ 0x578d29d06cc4f872, /* HI64_TERMS: (floor(x^127 / G) * x) - x^64 */
+ },
+};
+
+/*
+ * CRC folding constants generated for least-significant-bit-first CRC-64 using
+ * G(x) = x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 +
+ * x^47 + x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 +
+ * x^26 + x^23 + x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 +
+ * x^4 + x^3 + x^0
+ */
+static const struct {
+ u64 fold_across_2048_bits_consts[2];
+ u64 fold_across_1024_bits_consts[2];
+ u64 fold_across_512_bits_consts[2];
+ u64 fold_across_256_bits_consts[2];
+ u64 fold_across_128_bits_consts[2];
+ u8 shuf_table[48];
+ u64 barrett_reduction_consts[2];
+} crc64_lsb_0x9a6c9329ac4bc9b5_consts ____cacheline_aligned __maybe_unused = {
+ .fold_across_2048_bits_consts = {
+ 0x37ccd3e14069cabc, /* HI64_TERMS: (x^2111 mod G) * x^0 */
+ 0xa043808c0f782663, /* LO64_TERMS: (x^2047 mod G) * x^0 */
+ },
+ .fold_across_1024_bits_consts = {
+ 0xa1ca681e733f9c40, /* HI64_TERMS: (x^1087 mod G) * x^0 */
+ 0x5f852fb61e8d92dc, /* LO64_TERMS: (x^1023 mod G) * x^0 */
+ },
+ .fold_across_512_bits_consts = {
+ 0x0c32cdb31e18a84a, /* HI64_TERMS: (x^575 mod G) * x^0 */
+ 0x62242240ace5045a, /* LO64_TERMS: (x^511 mod G) * x^0 */
+ },
+ .fold_across_256_bits_consts = {
+ 0xb0bc2e589204f500, /* HI64_TERMS: (x^319 mod G) * x^0 */
+ 0xe1e0bb9d45d7a44c, /* LO64_TERMS: (x^255 mod G) * x^0 */
+ },
+ .fold_across_128_bits_consts = {
+ 0xeadc41fd2ba3d420, /* HI64_TERMS: (x^191 mod G) * x^0 */
+ 0x21e9761e252621ac, /* LO64_TERMS: (x^127 mod G) * x^0 */
+ },
+ .shuf_table = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ },
+ .barrett_reduction_consts = {
+ 0x27ecfa329aef9f77, /* HI64_TERMS: floor(x^127 / G) */
+ 0x34d926535897936a, /* LO64_TERMS: (G - x^64 - x^0) / x */
+ },
+};
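+
+/*
+ * Illustrative sketch (not produced by the generator script, and the helper
+ * names are hypothetical): each "(x^k mod G)" factor above can be computed
+ * with a simple shift-and-reduce loop. For the msb-first tables, bit i
+ * represents x^i and 'g' holds the low n coefficients of the degree-n
+ * generator (the x^n term is implicit); the lsb-first tables store the
+ * bit-reflected form of such values.
+ */
+static inline u64 crc_mulx_mod_sketch(u64 r, u64 g, int n)
+{
+ u64 hibit = 1ULL << (n - 1);
+ u64 carry = r & hibit;
+
+ r = (r << 1) & (2 * hibit - 1); /* multiply by x, keep n bits (mask wraps to all-ones for n == 64) */
+ return carry ? r ^ g : r; /* reduce by G if the x^n term appeared */
+}
+
+static inline u64 crc_xpow_mod_sketch(unsigned int k, u64 g, int n)
+{
+ u64 r = 1; /* x^0 */
+
+ while (k--)
+ r = crc_mulx_mod_sketch(r, g, n);
+ return r; /* x^k mod G */
+}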
diff --git a/lib/crc/x86/crc-pclmul-template.S b/lib/crc/x86/crc-pclmul-template.S
new file mode 100644
index 000000000000..a02f7dc8053e
--- /dev/null
+++ b/lib/crc/x86/crc-pclmul-template.S
@@ -0,0 +1,575 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+//
+// Template to generate [V]PCLMULQDQ-based CRC functions for x86
+//
+// Copyright 2025 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+
+#include <linux/linkage.h>
+#include <linux/objtool.h>
+
+// Offsets within the generated constants table
+.set OFFSETOF_BSWAP_MASK, -5*16 // msb-first CRCs only
+.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS, -4*16 // must precede next
+.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, -3*16 // must precede next
+.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS, -2*16 // must precede next
+.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS, -1*16 // must precede next
+.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS, 0*16 // must be 0
+.set OFFSETOF_SHUF_TABLE, 1*16
+.set OFFSETOF_BARRETT_REDUCTION_CONSTS, 4*16
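+//
+// (The consts pointer passed to the generated functions points at the
+// fold_across_128_bits_consts field of the constants struct, which is why
+// that offset is 0 and the earlier fields have negative offsets.)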
+
+// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the
+// corresponding non-VEX instruction plus any needed moves. The supported
+// instruction formats are:
+//
+// - Two-arg [src, dst], where the non-VEX format is the same.
+// - Three-arg [src1, src2, dst] where the non-VEX format is
+// [src1, src2_and_dst]. If src2 != dst, then src1 must != dst too.
+//
+// \insn gives the instruction without a "v" prefix and including any immediate
+// argument if needed to make the instruction follow one of the above formats.
+// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to
+// it first; this is needed when \arg1 is an unaligned mem operand.
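+//
+// For example (illustration only): '_cond_vex pxor, %xmm1, %xmm0, %xmm0'
+// emits 'vpxor %xmm1, %xmm0, %xmm0' when AVX_LEVEL > 0, but just
+// 'pxor %xmm1, %xmm0' when AVX_LEVEL == 0, since src2 == dst in the
+// three-arg form.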
+.macro _cond_vex insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp
+.if AVX_LEVEL == 0
+ // VEX not allowed. Emulate it.
+ .ifnb \arg3 // Three-arg [src1, src2, dst]
+ .ifc "\arg2", "\arg3" // src2 == dst?
+ .ifnb \unaligned_mem_tmp
+ movdqu \arg1, \unaligned_mem_tmp
+ \insn \unaligned_mem_tmp, \arg3
+ .else
+ \insn \arg1, \arg3
+ .endif
+ .else // src2 != dst
+ .ifc "\arg1", "\arg3"
+ .error "Can't have src1 == dst when src2 != dst"
+ .endif
+ .ifnb \unaligned_mem_tmp
+ movdqu \arg1, \unaligned_mem_tmp
+ movdqa \arg2, \arg3
+ \insn \unaligned_mem_tmp, \arg3
+ .else
+ movdqa \arg2, \arg3
+ \insn \arg1, \arg3
+ .endif
+ .endif
+ .else // Two-arg [src, dst]
+ .ifnb \unaligned_mem_tmp
+ movdqu \arg1, \unaligned_mem_tmp
+ \insn \unaligned_mem_tmp, \arg2
+ .else
+ \insn \arg1, \arg2
+ .endif
+ .endif
+.else
+ // VEX is allowed. Emit the desired instruction directly.
+ .ifnb \arg3
+ v\insn \arg1, \arg2, \arg3
+ .else
+ v\insn \arg1, \arg2
+ .endif
+.endif
+.endm
+
+// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
+// register of length VL.
+.macro _vbroadcast src, dst
+.if VL == 16
+ _cond_vex movdqa, \src, \dst
+.elseif VL == 32
+ vbroadcasti128 \src, \dst
+.else
+ vbroadcasti32x4 \src, \dst
+.endif
+.endm
+
+// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC
+// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane.
+.macro _load_data vl, src, bswap_mask, dst
+.if \vl < 64
+ _cond_vex movdqu, "\src", \dst
+.else
+ vmovdqu8 \src, \dst
+.endif
+.if !LSB_CRC
+ _cond_vex pshufb, \bswap_mask, \dst, \dst
+.endif
+.endm
+
+.macro _prepare_v0 vl, v0, v1, bswap_mask
+.if LSB_CRC
+ .if \vl < 64
+ _cond_vex pxor, (BUF), \v0, \v0, unaligned_mem_tmp=\v1
+ .else
+ vpxorq (BUF), \v0, \v0
+ .endif
+.else
+ _load_data \vl, (BUF), \bswap_mask, \v1
+ .if \vl < 64
+ _cond_vex pxor, \v1, \v0, \v0
+ .else
+ vpxorq \v1, \v0, \v0
+ .endif
+.endif
+.endm
+
+// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for
+// msb-first order or the physically high qword for lsb-first order
+#define LO64_TERMS 0
+
+// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high
+// qword for msb-first order or the physically low qword for lsb-first order
+#define HI64_TERMS 1
+
+// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given
+// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst.
+.macro _pclmulqdq src1, src1_terms, src2, src2_terms, dst
+ _cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \
+ \src1, \src2, \dst
+.endm
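+
+// For instance, with an lsb-first CRC (LSB_CRC == 1), the logical HI64_TERMS
+// of a source live in its physically low qword, so the requested terms
+// selectors are XORed with LSB_CRC above to form the physical qword selector
+// bits of the [v]pclmulqdq immediate.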
+
+// Fold \acc into \data and store the result back into \acc. \data can be an
+// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no
+// byte-reflection is needed; otherwise it must be a vector register. \consts
+// is a vector register containing the needed fold constants, and \tmp is a
+// temporary vector register. All arguments must be the same length.
+.macro _fold_vec acc, data, consts, tmp
+ _pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp
+ _pclmulqdq \consts, LO64_TERMS, \acc, LO64_TERMS, \acc
+.if AVX_LEVEL <= 2
+ _cond_vex pxor, \data, \tmp, \tmp
+ _cond_vex pxor, \tmp, \acc, \acc
+.else
+ vpternlogq $0x96, \data, \tmp, \acc
+.endif
+.endm
+
+// Fold \acc into \data and store the result back into \acc. \data is an
+// unaligned mem operand, \consts is a vector register containing the needed
+// fold constants, \bswap_mask is a vector register containing the
+// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are
+// temporary vector registers. All arguments must have length \vl.
+.macro _fold_vec_mem vl, acc, data, consts, bswap_mask, tmp1, tmp2
+.if AVX_LEVEL == 0 || !LSB_CRC
+ _load_data \vl, \data, \bswap_mask, \tmp1
+ _fold_vec \acc, \tmp1, \consts, \tmp2
+.else
+ _fold_vec \acc, \data, \consts, \tmp1
+.endif
+.endm
+
+// Load the constants for folding across 2**i vectors of length VL at a time
+// into all 128-bit lanes of the vector register CONSTS.
+.macro _load_vec_folding_consts i
+ _vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \
+ CONSTS
+.endm
+
+// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store
+// the result back into \v0. Then, if the \vl bit of the remaining length is
+// set (i.e. the remaining length mod 2*\vl is at least \vl), also fold \vl
+// data bytes from BUF. For both operations the fold distance is \vl.
+// \consts must be a register of length \vl containing the fold constants.
+.macro _fold_vec_final vl, v0, v1, consts, bswap_mask, tmp1, tmp2
+ _fold_vec \v0, \v1, \consts, \tmp1
+ test $\vl, LEN8
+ jz .Lfold_vec_final_done\@
+ _fold_vec_mem \vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2
+ add $\vl, BUF
+.Lfold_vec_final_done\@:
+.endm
+
+// This macro generates the body of a CRC function with the following prototype:
+//
+// crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts);
+//
+// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it.
+// |buf| is the data to checksum. |len| is the data length in bytes, which must
+// be at least 16. |consts| is a pointer to the fold_across_128_bits_consts
+// field of the constants struct that was generated for the chosen CRC variant.
+//
+// Moving on to the macro parameters, \n is the number of bits in the CRC, e.g.
+// 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If
+// the file is compiled in i386 mode, then the maximum supported value is 32.
+//
+// \lsb_crc is 1 if the CRC processes the least significant bit of each byte
+// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0. \lsb_crc is 0
+// if the CRC processes the most significant bit of each byte first, i.e. maps
+// bit0 to x^0, bit1 to x^1, ..., bit7 to x^7.
+//
+// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
+//
+// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or
+// 512 for AVX512.
+//
+// If \vl == 16 && \avx_level == 0, the generated code requires:
+// PCLMULQDQ && SSE4.1. (Note: all known CPUs with PCLMULQDQ also have SSE4.1.)
+//
+// If \vl == 32 && \avx_level == 2, the generated code requires:
+// VPCLMULQDQ && AVX2.
+//
+// If \vl == 64 && \avx_level == 512, the generated code requires:
+// VPCLMULQDQ && AVX512BW && AVX512VL.
+//
+// Other \vl and \avx_level combinations are either not supported or not useful.
+.macro _crc_pclmul n, lsb_crc, vl, avx_level
+ .set LSB_CRC, \lsb_crc
+ .set VL, \vl
+ .set AVX_LEVEL, \avx_level
+
+ // Define aliases for the xmm, ymm, or zmm registers according to VL.
+.irp i, 0,1,2,3,4,5,6,7
+ .if VL == 16
+ .set V\i, %xmm\i
+ .set LOG2_VL, 4
+ .elseif VL == 32
+ .set V\i, %ymm\i
+ .set LOG2_VL, 5
+ .elseif VL == 64
+ .set V\i, %zmm\i
+ .set LOG2_VL, 6
+ .else
+ .error "Unsupported vector length"
+ .endif
+.endr
+ // Define aliases for the function parameters.
+ // Note: when crc_t is shorter than u32, zero-extension to 32 bits is
+ // guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed
+ // when crc_t is shorter than u64.
+#ifdef __x86_64__
+.if \n <= 32
+ .set CRC, %edi
+.else
+ .set CRC, %rdi
+.endif
+ .set BUF, %rsi
+ .set LEN, %rdx
+ .set LEN32, %edx
+ .set LEN8, %dl
+ .set CONSTS_PTR, %rcx
+#else
+ // 32-bit support, assuming -mregparm=3 and not including support for
+ // CRC-64 (which would use both eax and edx to pass the crc parameter).
+ .set CRC, %eax
+ .set BUF, %edx
+ .set LEN, %ecx
+ .set LEN32, %ecx
+ .set LEN8, %cl
+ .set CONSTS_PTR, %ebx // Passed on stack
+#endif
+
+ // Define aliases for some local variables. V0-V5 are used without
+ // aliases (for accumulators, data, temporary values, etc). Staying
+ // within the first 8 vector registers keeps the code 32-bit SSE
+ // compatible and reduces the size of 64-bit SSE code slightly.
+ .set BSWAP_MASK, V6
+ .set BSWAP_MASK_YMM, %ymm6
+ .set BSWAP_MASK_XMM, %xmm6
+ .set CONSTS, V7
+ .set CONSTS_YMM, %ymm7
+ .set CONSTS_XMM, %xmm7
+
+ // Use ANNOTATE_NOENDBR to suppress an objtool warning, since the
+ // functions generated by this macro are called only by static_call.
+ ANNOTATE_NOENDBR
+
+#ifdef __i386__
+ push CONSTS_PTR
+ mov 8(%esp), CONSTS_PTR
+#endif
+
+ // Create a 128-bit vector that holds the initial CRC at the end that
+ // represents the high-order polynomial coefficients, and 0 elsewhere.
+ // If the CRC is msb-first, also load the byte-reflection table.
+.if \n <= 32
+ _cond_vex movd, CRC, %xmm0
+.else
+ _cond_vex movq, CRC, %xmm0
+.endif
+.if !LSB_CRC
+ _cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0
+ _vbroadcast OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK
+.endif
+
+ // Load the first vector of data and XOR the initial CRC into the
+ // appropriate end of the first 128-bit lane of data. If LEN < VL, then
+ // use a short vector and jump ahead to the final reduction. (LEN >= 16
+ // is guaranteed here but not necessarily LEN >= VL.)
+.if VL >= 32
+ cmp $VL, LEN
+ jae .Lat_least_1vec\@
+ .if VL == 64
+ cmp $32, LEN32
+ jb .Lless_than_32bytes\@
+ _prepare_v0 32, %ymm0, %ymm1, BSWAP_MASK_YMM
+ add $32, BUF
+ jmp .Lreduce_256bits_to_128bits\@
+.Lless_than_32bytes\@:
+ .endif
+ _prepare_v0 16, %xmm0, %xmm1, BSWAP_MASK_XMM
+ add $16, BUF
+ vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
+ jmp .Lcheck_for_partial_block\@
+.Lat_least_1vec\@:
+.endif
+ _prepare_v0 VL, V0, V1, BSWAP_MASK
+
+ // Handle VL <= LEN < 4*VL.
+ cmp $4*VL-1, LEN
+ ja .Lat_least_4vecs\@
+ add $VL, BUF
+ // If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector.
+ // If VL==16 then load fold_across_128_bits_consts first, as the final
+ // reduction depends on it and it won't be loaded anywhere else.
+ cmp $2*VL-1, LEN32
+.if VL == 16
+ _cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
+.endif
+ jbe .Lreduce_1vec_to_128bits\@
+ // Otherwise 2*VL <= LEN < 4*VL. Load one more vector and jump ahead to
+ // the reduction from 2 vectors.
+ _load_data VL, (BUF), BSWAP_MASK, V1
+ add $VL, BUF
+ jmp .Lreduce_2vecs_to_1\@
+
+.Lat_least_4vecs\@:
+ // Load 3 more vectors of data.
+ _load_data VL, 1*VL(BUF), BSWAP_MASK, V1
+ _load_data VL, 2*VL(BUF), BSWAP_MASK, V2
+ _load_data VL, 3*VL(BUF), BSWAP_MASK, V3
+ sub $-4*VL, BUF // Shorter than 'add 4*VL' when VL=32
+ add $-4*VL, LEN // Shorter than 'sub 4*VL' when VL=32
+
+ // Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next
+ // 4 vectors of data and write the result back to V0-V3.
+ cmp $4*VL-1, LEN // Shorter than 'cmp 4*VL' when VL=32
+ jbe .Lreduce_4vecs_to_2\@
+ _load_vec_folding_consts 2
+.Lfold_4vecs_loop\@:
+ _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+ _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+ _fold_vec_mem VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+ _fold_vec_mem VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+ sub $-4*VL, BUF
+ add $-4*VL, LEN
+ cmp $4*VL-1, LEN
+ ja .Lfold_4vecs_loop\@
+
+ // Fold V0,V1 into V2,V3 and write the result back to V0,V1. Then fold
+ // two more vectors of data from BUF, if at least that much remains.
+.Lreduce_4vecs_to_2\@:
+ _load_vec_folding_consts 1
+ _fold_vec V0, V2, CONSTS, V4
+ _fold_vec V1, V3, CONSTS, V4
+ test $2*VL, LEN8
+ jz .Lreduce_2vecs_to_1\@
+ _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+ _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+ sub $-2*VL, BUF
+
+ // Fold V0 into V1 and write the result back to V0. Then fold one more
+ // vector of data from BUF, if at least that much remains.
+.Lreduce_2vecs_to_1\@:
+ _load_vec_folding_consts 0
+ _fold_vec_final VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5
+
+.Lreduce_1vec_to_128bits\@:
+.if VL == 64
+ // Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of
+ // data from BUF, if at least that much remains.
+ vbroadcasti128 OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM
+ vextracti64x4 $1, %zmm0, %ymm1
+ _fold_vec_final 32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5
+.Lreduce_256bits_to_128bits\@:
+.endif
+.if VL >= 32
+ // Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of
+ // data from BUF, if at least that much remains.
+ vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
+ vextracti128 $1, %ymm0, %xmm1
+ _fold_vec_final 16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5
+.Lcheck_for_partial_block\@:
+.endif
+ and $15, LEN32
+ jz .Lreduce_128bits_to_crc\@
+
+ // 1 <= LEN <= 15 data bytes remain in BUF. The polynomial is now
+ // A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0
+ // and B is the polynomial of the remaining LEN data bytes. To reduce
+ // this to 128 bits without needing fold constants for each possible
+ // LEN, rearrange this expression into C1*(x^128) + C2, where
+ // C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128.
+ // Then fold C1 into C2, which is just another fold across 128 bits.
+
+.if !LSB_CRC || AVX_LEVEL == 0
+ // Load the last 16 data bytes. Note that originally LEN was >= 16.
+ _load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2
+.endif // Else will use vpblendvb mem operand later.
+.if !LSB_CRC
+ neg LEN // Needed for indexing shuf_table
+.endif
+
+ // tmp = A*x^(8*LEN) mod x^128
+ // lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1]
+ // i.e. right-shift by LEN bytes.
+ // msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN]
+ // i.e. left-shift by LEN bytes.
+ _cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3
+ _cond_vex pshufb, %xmm3, %xmm0, %xmm1
+
+ // C1 = floor(A / x^(128 - 8*LEN))
+ // lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1]
+ // i.e. left-shift by 16-LEN bytes.
+ // msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1]
+ // i.e. right-shift by 16-LEN bytes.
+ _cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \
+ %xmm0, %xmm0, unaligned_mem_tmp=%xmm4
+
+ // C2 = tmp + B. This is just a blend of tmp with the last 16 data
+ // bytes (reflected if msb-first). The blend mask is the shuffle table
+ // that was used to create tmp: 0 selects a byte of tmp, and 1 selects
+ // the corresponding byte of the last 16 data bytes.
+.if AVX_LEVEL == 0
+ movdqa %xmm0, %xmm4
+ movdqa %xmm3, %xmm0
+ pblendvb %xmm2, %xmm1 // uses %xmm0 as implicit operand
+ movdqa %xmm4, %xmm0
+.elseif LSB_CRC
+ vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1
+.else
+ vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+.endif
+
+ // Fold C1 into C2 and store the 128-bit result in %xmm0.
+ _fold_vec %xmm0, %xmm1, CONSTS_XMM, %xmm4
+
+.Lreduce_128bits_to_crc\@:
+ // Compute the CRC as %xmm0 * x^n mod G. Here %xmm0 means the 128-bit
+ // polynomial stored in %xmm0 (using either lsb-first or msb-first bit
+ // order according to LSB_CRC), and G is the CRC's generator polynomial.
+
+ // First, multiply %xmm0 by x^n and reduce the result to 64+n bits:
+ //
+ // t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) +
+ // x^n * (%xmm0 mod x^64)
+ //
+ // Store t0 * x^(64-n) in %xmm0. I.e., actually do:
+ //
+ // %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) +
+ // x^64 * (%xmm0 mod x^64)
+ //
+ // The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned
+ // to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily
+ // select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the
+ // msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case
+ // (considering the extra factor of x that gets implicitly introduced by
+ // each pclmulqdq when using lsb-first order), is identical to the
+ // constant that was used earlier for folding the LO64_TERMS across 128
+ // bits. Thus it's already available in LO64_TERMS of CONSTS_XMM.
+ _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1
+.if LSB_CRC
+ _cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
+.else
+ _cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
+.endif
+ _cond_vex pxor, %xmm1, %xmm0, %xmm0
+ // The HI64_TERMS of %xmm0 now contain floor(t0 / x^n).
+ // The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n).
+
+ // First step of Barrett reduction: Compute floor(t0 / G). This is the
+ // polynomial by which G needs to be multiplied to cancel out the x^n
+ // and higher terms of t0, i.e. to reduce t0 mod G. First do:
+ //
+ // t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n)
+ //
+ // Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in
+ // x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest
+ // value that carries enough precision through the calculation.
+ //
+ // The '* x' makes it so the result is floor(t1 / x^64) rather than
+ // floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it
+ // can be extracted much more easily in the next step. In the lsb-first
+ // case the '* x' happens implicitly. In the msb-first case it must be
+ // done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the
+ // constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and
+ // the multiplication by the x^64 term is handled using a pxor. The
+ // pxor causes the low 64 terms of t1 to be wrong, but they are unused.
+ _cond_vex movdqa, OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM
+ _pclmulqdq CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1
+.if !LSB_CRC
+ _cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n)
+.endif
+ // The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G).
+
+ // Second step of Barrett reduction: Cancel out the x^n and higher terms
+ // of t0 by subtracting the needed multiple of G. This gives the CRC:
+ //
+ // crc := t0 - (G * floor(t0 / G))
+ //
+ // But %xmm0 contains t0 * x^(64-n), so it's more convenient to do:
+ //
+ // crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n)
+ //
+ // Furthermore, since the resulting CRC is n-bit, if mod x^n is
+ // explicitly applied to it then the x^n term of G makes no difference
+ // in the result and can be omitted. This helps keep the constant
+ // multiplier in 64 bits in most cases. This gives the following:
+ //
+ // %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G))
+ // crc := (%xmm0 / x^(64-n)) mod x^n
+ //
+ // In the lsb-first case, each pclmulqdq implicitly introduces
+ // an extra factor of x, so in that case the constant that needs to be
+ // passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63.
+ // For lsb-first CRCs where n=64, the extra factor of x cannot be as
+ // easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to
+ // pclmulqdq and handle the x^0 term (i.e. 1) separately. (All CRC
+ // polynomials have nonzero x^n and x^0 terms.) It works out as: the
+ // CRC has to be XORed with the physically low qword of %xmm1, representing
+ // floor(t0 / G). The most efficient way to do that is to move it to
+ // the physically high qword and use a ternlog to combine the two XORs.
+.if LSB_CRC && \n == 64
+ _cond_vex punpcklqdq, %xmm1, %xmm2, %xmm2
+ _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
+ .if AVX_LEVEL <= 2
+ _cond_vex pxor, %xmm2, %xmm0, %xmm0
+ _cond_vex pxor, %xmm1, %xmm0, %xmm0
+ .else
+ vpternlogq $0x96, %xmm2, %xmm1, %xmm0
+ .endif
+ _cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64
+.else
+ _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
+ _cond_vex pxor, %xmm1, %xmm0, %xmm0
+ .if \n == 8
+ _cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8
+ .elseif \n == 16
+ _cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16
+ .elseif \n == 32
+ _cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32
+ .else // \n == 64 && !LSB_CRC
+ _cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64
+ .endif
+.endif
+
+.if VL > 16
+ vzeroupper // Needed when ymm or zmm registers may have been used.
+.endif
+#ifdef __i386__
+ pop CONSTS_PTR
+#endif
+ RET
+.endm
+
+#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \
+SYM_FUNC_START(prefix##_pclmul_sse); \
+ _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \
+SYM_FUNC_END(prefix##_pclmul_sse); \
+ \
+SYM_FUNC_START(prefix##_vpclmul_avx2); \
+ _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=2; \
+SYM_FUNC_END(prefix##_vpclmul_avx2); \
+ \
+SYM_FUNC_START(prefix##_vpclmul_avx512); \
+ _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=512; \
+SYM_FUNC_END(prefix##_vpclmul_avx512);
diff --git a/lib/crc/x86/crc-pclmul-template.h b/lib/crc/x86/crc-pclmul-template.h
new file mode 100644
index 000000000000..02744831c6fa
--- /dev/null
+++ b/lib/crc/x86/crc-pclmul-template.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are
+ * instantiated by crc-pclmul-template.S
+ *
+ * Copyright 2025 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+#ifndef _CRC_PCLMUL_TEMPLATE_H
+#define _CRC_PCLMUL_TEMPLATE_H
+
+#include <asm/cpufeatures.h>
+#include <asm/simd.h>
+#include <linux/static_call.h>
+#include "crc-pclmul-consts.h"
+
+#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t) \
+crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len, \
+ const void *consts_ptr); \
+crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len, \
+ const void *consts_ptr); \
+crc_t prefix##_vpclmul_avx512(crc_t crc, const u8 *p, size_t len, \
+ const void *consts_ptr); \
+DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse)
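+
+/*
+ * The static call defaults to the SSE variant; the arch headers'
+ * *_mod_init_arch() functions upgrade it via static_call_update() at boot
+ * when the CPU supports a VPCLMULQDQ variant.
+ */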
+
+static inline bool have_vpclmul(void)
+{
+ return boot_cpu_has(X86_FEATURE_VPCLMULQDQ) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL);
+}
+
+static inline bool have_avx512(void)
+{
+ return boot_cpu_has(X86_FEATURE_AVX512BW) &&
+ boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ !boot_cpu_has(X86_FEATURE_PREFER_YMM) &&
+ cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL);
+}
+
+/*
+ * Call a [V]PCLMULQDQ optimized CRC function if the data length is at least 16
+ * bytes, the CPU has PCLMULQDQ support, and the current context may use SIMD.
+ *
+ * 16 bytes is the minimum length supported by the [V]PCLMULQDQ functions.
+ * There is overhead associated with kernel_fpu_begin() and kernel_fpu_end(),
+ * varying by CPU and factors such as which parts of the "FPU" state userspace
+ * has touched, which could result in a larger cutoff being better. Indeed, a
+ * larger cutoff is usually better for a *single* message. However, the
+ * overhead of the FPU section gets amortized if multiple FPU sections get
+ * executed before returning to userspace, since the XSAVE and XRSTOR occur only
+ * once. Considering that and the fact that the [V]PCLMULQDQ code is lighter on
+ * the dcache than the table-based code is, a 16-byte cutoff seems to work well.
+ */
+#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq) \
+do { \
+ if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) && \
+ likely(irq_fpu_usable())) { \
+ const void *consts_ptr; \
+ \
+ consts_ptr = (consts).fold_across_128_bits_consts; \
+ kernel_fpu_begin(); \
+ crc = static_call(prefix##_pclmul)((crc), (p), (len), \
+ consts_ptr); \
+ kernel_fpu_end(); \
+ return crc; \
+ } \
+} while (0)
+
+#endif /* _CRC_PCLMUL_TEMPLATE_H */
diff --git a/lib/crc/x86/crc-t10dif.h b/lib/crc/x86/crc-t10dif.h
new file mode 100644
index 000000000000..8ee8824da551
--- /dev/null
+++ b/lib/crc/x86/crc-t10dif.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * CRC-T10DIF using [V]PCLMULQDQ instructions
+ *
+ * Copyright 2024 Google LLC
+ */
+
+#include "crc-pclmul-template.h"
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+
+DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16);
+
+static inline u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len)
+{
+ CRC_PCLMUL(crc, p, len, crc16_msb, crc16_msb_0x8bb7_consts,
+ have_pclmulqdq);
+ return crc_t10dif_generic(crc, p, len);
+}
+
+#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch
+static void crc_t10dif_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
+ static_branch_enable(&have_pclmulqdq);
+ if (have_vpclmul()) {
+ if (have_avx512())
+ static_call_update(crc16_msb_pclmul,
+ crc16_msb_vpclmul_avx512);
+ else
+ static_call_update(crc16_msb_pclmul,
+ crc16_msb_vpclmul_avx2);
+ }
+ }
+}
diff --git a/lib/crc/x86/crc16-msb-pclmul.S b/lib/crc/x86/crc16-msb-pclmul.S
new file mode 100644
index 000000000000..e9fe248093a8
--- /dev/null
+++ b/lib/crc/x86/crc16-msb-pclmul.S
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+// Copyright 2025 Google LLC
+
+#include "crc-pclmul-template.S"
+
+DEFINE_CRC_PCLMUL_FUNCS(crc16_msb, /* bits= */ 16, /* lsb= */ 0)
diff --git a/lib/crc/x86/crc32-pclmul.S b/lib/crc/x86/crc32-pclmul.S
new file mode 100644
index 000000000000..f20f40fb0172
--- /dev/null
+++ b/lib/crc/x86/crc32-pclmul.S
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+// Copyright 2025 Google LLC
+
+#include "crc-pclmul-template.S"
+
+DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1)
diff --git a/lib/crc/x86/crc32.h b/lib/crc/x86/crc32.h
new file mode 100644
index 000000000000..19a5e3c6c73b
--- /dev/null
+++ b/lib/crc/x86/crc32.h
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * x86-optimized CRC32 functions
+ *
+ * Copyright (C) 2008 Intel Corporation
+ * Copyright 2012 Xyratex Technology Limited
+ * Copyright 2024 Google LLC
+ */
+
+#include "crc-pclmul-template.h"
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512);
+
+DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
+
+static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+ CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
+ have_pclmulqdq);
+ return crc32_le_base(crc, p, len);
+}
+
+#ifdef CONFIG_X86_64
+#define CRC32_INST "crc32q %1, %q0"
+#else
+#define CRC32_INST "crc32l %1, %0"
+#endif
+
+/*
+ * Use carryless multiply version of crc32c when buffer size is >= 512 to
+ * account for FPU state save/restore overhead.
+ */
+#define CRC32C_PCLMUL_BREAKEVEN 512
+
+asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
+
+static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
+{
+ size_t num_longs;
+
+ if (!static_branch_likely(&have_crc32))
+ return crc32c_base(crc, p, len);
+
+ if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
+ static_branch_likely(&have_pclmulqdq) && likely(irq_fpu_usable())) {
+ /*
+ * Long length, the vector registers are usable, and the CPU is
+ * 64-bit and supports both CRC32 and PCLMULQDQ instructions.
+ * It is worthwhile to divide the data into multiple streams,
+ * CRC them independently, and combine them using PCLMULQDQ.
+ * crc32c_x86_3way() does this using 3 streams, which is the
+ * most that x86_64 CPUs have traditionally been capable of.
+ *
+ * However, due to improved VPCLMULQDQ performance on newer
+ * CPUs, use crc32_lsb_vpclmul_avx512() instead of
+ * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a
+ * "good" implementation of AVX-512.
+ *
+ * Future work: the optimal strategy on Zen 3--5 is actually to
+ * use both crc32q and VPCLMULQDQ in parallel. Unfortunately,
+ * different numbers of streams and vector lengths are optimal
+ * on each CPU microarchitecture, making it challenging to take
+ * advantage of this. (Zen 5 even supports 7 parallel crc32q, a
+ * major upgrade.) For now, just choose between
+ * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512(). The latter
+ * is needed anyway for crc32_le(), so we just reuse it here.
+ */
+ kernel_fpu_begin();
+ if (static_branch_likely(&have_vpclmul_avx512))
+ crc = crc32_lsb_vpclmul_avx512(crc, p, len,
+ crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
+ else
+ crc = crc32c_x86_3way(crc, p, len);
+ kernel_fpu_end();
+ return crc;
+ }
+
+ /*
+ * Short length, XMM registers unusable, or the CPU is 32-bit; but the
+ * CPU supports CRC32 instructions. Just issue a single stream of CRC32
+ * instructions inline. While this doesn't use the CPU's CRC32
+ * throughput very well, it avoids the need to combine streams. Stream
+ * combination would be inefficient here.
+ */
+
+ for (num_longs = len / sizeof(unsigned long);
+ num_longs != 0; num_longs--, p += sizeof(unsigned long))
+ asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));
+
+ if (sizeof(unsigned long) > 4 && (len & 4)) {
+ asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
+ p += 4;
+ }
+ if (len & 2) {
+ asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
+ p += 2;
+ }
+ if (len & 1)
+ asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));
+
+ return crc;
+}
+
+#define crc32_be_arch crc32_be_base /* not implemented on this arch */
+
+#define crc32_mod_init_arch crc32_mod_init_arch
+static void crc32_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_XMM4_2))
+ static_branch_enable(&have_crc32);
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
+ static_branch_enable(&have_pclmulqdq);
+ if (have_vpclmul()) {
+ if (have_avx512()) {
+ static_call_update(crc32_lsb_pclmul,
+ crc32_lsb_vpclmul_avx512);
+ static_branch_enable(&have_vpclmul_avx512);
+ } else {
+ static_call_update(crc32_lsb_pclmul,
+ crc32_lsb_vpclmul_avx2);
+ }
+ }
+ }
+}
+
+static inline u32 crc32_optimizations_arch(void)
+{
+ u32 optimizations = 0;
+
+ if (static_key_enabled(&have_crc32))
+ optimizations |= CRC32C_OPTIMIZATION;
+ if (static_key_enabled(&have_pclmulqdq))
+ optimizations |= CRC32_LE_OPTIMIZATION;
+ return optimizations;
+}
diff --git a/lib/crc/x86/crc32c-3way.S b/lib/crc/x86/crc32c-3way.S
new file mode 100644
index 000000000000..9b8770503bbc
--- /dev/null
+++ b/lib/crc/x86/crc32c-3way.S
@@ -0,0 +1,360 @@
+/*
+ * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
+ *
+ * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
+ * downloaded from:
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
+ *
+ * Copyright (C) 2012 Intel Corporation.
+ * Copyright 2024 Google LLC
+ *
+ * Authors:
+ * Wajdi Feghali <wajdi.k.feghali@intel.com>
+ * James Guilford <james.guilford@intel.com>
+ * David Cote <david.m.cote@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/linkage.h>
+
+## iSCSI CRC-32 implementation with crc32 and pclmulqdq instructions
+
+# Define threshold below which buffers are considered "small" and routed to
+# regular CRC code that does not interleave the CRC instructions.
+#define SMALL_SIZE 200
+
+# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
+
+.text
+SYM_FUNC_START(crc32c_x86_3way)
+#define crc0 %edi
+#define crc0_q %rdi
+#define bufp %rsi
+#define bufp_d %esi
+#define len %rdx
+#define len_dw %edx
+#define n_misaligned %ecx /* overlaps chunk_bytes! */
+#define n_misaligned_q %rcx
+#define chunk_bytes %ecx /* overlaps n_misaligned! */
+#define chunk_bytes_q %rcx
+#define crc1 %r8
+#define crc2 %r9
+
+ cmp $SMALL_SIZE, len
+ jb .Lsmall
+
+ ################################################################
+ ## 1) ALIGN:
+ ################################################################
+ mov bufp_d, n_misaligned
+ neg n_misaligned
+ and $7, n_misaligned # calculate the misalignment amount of
+ # the address
+ je .Laligned # Skip if aligned
+
+ # Process 1 <= n_misaligned <= 7 bytes individually in order to align
+ # the remaining data to an 8-byte boundary.
+.Ldo_align:
+ movq (bufp), %rax
+ add n_misaligned_q, bufp
+ sub n_misaligned_q, len
+.Lalign_loop:
+ crc32b %al, crc0 # compute crc32 of 1 byte
+ shr $8, %rax # get next byte
+ dec n_misaligned
+ jne .Lalign_loop
+.Laligned:
+
+ ################################################################
+ ## 2) PROCESS BLOCK:
+ ################################################################
+
+ cmp $128*24, len
+ jae .Lfull_block
+
+.Lpartial_block:
+ # Compute floor(len / 24) to get num qwords to process from each lane.
+ imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24)
+ shr $16, %eax
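+ # (This reciprocal multiply is exact for len < 8192, since
+ # 2731*24 - 2^16 = 8; here len < 128*24.)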
+ jmp .Lcrc_3lanes
+
+.Lfull_block:
+ # Processing 128 qwords from each lane.
+ mov $128, %eax
+
+ ################################################################
+ ## 3) CRC each of three lanes:
+ ################################################################
+
+.Lcrc_3lanes:
+ xor crc1,crc1
+ xor crc2,crc2
+ mov %eax, chunk_bytes
+ shl $3, chunk_bytes # num bytes to process from each lane
+ sub $5, %eax # 4 for 4x_loop, 1 for special last iter
+ jl .Lcrc_3lanes_4x_done
+
+ # Unroll the loop by a factor of 4 to reduce the overhead of the loop
+ # bookkeeping instructions, which can compete with crc32q for the ALUs.
+.Lcrc_3lanes_4x_loop:
+ crc32q (bufp), crc0_q
+ crc32q (bufp,chunk_bytes_q), crc1
+ crc32q (bufp,chunk_bytes_q,2), crc2
+ crc32q 8(bufp), crc0_q
+ crc32q 8(bufp,chunk_bytes_q), crc1
+ crc32q 8(bufp,chunk_bytes_q,2), crc2
+ crc32q 16(bufp), crc0_q
+ crc32q 16(bufp,chunk_bytes_q), crc1
+ crc32q 16(bufp,chunk_bytes_q,2), crc2
+ crc32q 24(bufp), crc0_q
+ crc32q 24(bufp,chunk_bytes_q), crc1
+ crc32q 24(bufp,chunk_bytes_q,2), crc2
+ add $32, bufp
+ sub $4, %eax
+ jge .Lcrc_3lanes_4x_loop
+
+.Lcrc_3lanes_4x_done:
+ add $4, %eax
+ jz .Lcrc_3lanes_last_qword
+
+.Lcrc_3lanes_1x_loop:
+ crc32q (bufp), crc0_q
+ crc32q (bufp,chunk_bytes_q), crc1
+ crc32q (bufp,chunk_bytes_q,2), crc2
+ add $8, bufp
+ dec %eax
+ jnz .Lcrc_3lanes_1x_loop
+
+.Lcrc_3lanes_last_qword:
+ crc32q (bufp), crc0_q
+ crc32q (bufp,chunk_bytes_q), crc1
+# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
+
+ ################################################################
+ ## 4) Combine three results:
+ ################################################################
+
+ lea (K_table-8)(%rip), %rax # first entry is for idx 1
+ pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
+ lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
+ sub %rax, len # len -= chunk_bytes * 3
+
+ movq crc0_q, %xmm1 # CRC for block 1
+ pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
+
+ movq crc1, %xmm2 # CRC for block 2
+ pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
+
+ pxor %xmm2,%xmm1
+ movq %xmm1, %rax
+ xor (bufp,chunk_bytes_q,2), %rax
+ mov crc2, crc0_q
+ crc32 %rax, crc0_q
+ lea 8(bufp,chunk_bytes_q,2), bufp
+
+ ################################################################
+ ## 5) If more blocks remain, goto (2):
+ ################################################################
+
+ cmp $128*24, len
+ jae .Lfull_block
+ cmp $SMALL_SIZE, len
+ jae .Lpartial_block
+
+ #######################################################################
+ ## 6) Process any remainder without interleaving:
+ #######################################################################
+.Lsmall:
+ test len_dw, len_dw
+ jz .Ldone
+ mov len_dw, %eax
+ shr $3, %eax
+ jz .Ldo_dword
+.Ldo_qwords:
+ crc32q (bufp), crc0_q
+ add $8, bufp
+ dec %eax
+ jnz .Ldo_qwords
+.Ldo_dword:
+ test $4, len_dw
+ jz .Ldo_word
+ crc32l (bufp), crc0
+ add $4, bufp
+.Ldo_word:
+ test $2, len_dw
+ jz .Ldo_byte
+ crc32w (bufp), crc0
+ add $2, bufp
+.Ldo_byte:
+ test $1, len_dw
+ jz .Ldone
+ crc32b (bufp), crc0
+.Ldone:
+ mov crc0, %eax
+ RET
+SYM_FUNC_END(crc32c_x86_3way)
+
+.section .rodata, "a", @progbits
+ ################################################################
+ ## PCLMULQDQ tables
+	## The table has 128 entries; each entry is 2 words (8 bytes)
+ ################################################################
+.align 8
+K_table:
+ .long 0x493c7d27, 0x00000001
+ .long 0xba4fc28e, 0x493c7d27
+ .long 0xddc0152b, 0xf20c0dfe
+ .long 0x9e4addf8, 0xba4fc28e
+ .long 0x39d3b296, 0x3da6d0cb
+ .long 0x0715ce53, 0xddc0152b
+ .long 0x47db8317, 0x1c291d04
+ .long 0x0d3b6092, 0x9e4addf8
+ .long 0xc96cfdc0, 0x740eef02
+ .long 0x878a92a7, 0x39d3b296
+ .long 0xdaece73e, 0x083a6eec
+ .long 0xab7aff2a, 0x0715ce53
+ .long 0x2162d385, 0xc49f4f67
+ .long 0x83348832, 0x47db8317
+ .long 0x299847d5, 0x2ad91c30
+ .long 0xb9e02b86, 0x0d3b6092
+ .long 0x18b33a4e, 0x6992cea2
+ .long 0xb6dd949b, 0xc96cfdc0
+ .long 0x78d9ccb7, 0x7e908048
+ .long 0xbac2fd7b, 0x878a92a7
+ .long 0xa60ce07b, 0x1b3d8f29
+ .long 0xce7f39f4, 0xdaece73e
+ .long 0x61d82e56, 0xf1d0f55e
+ .long 0xd270f1a2, 0xab7aff2a
+ .long 0xc619809d, 0xa87ab8a8
+ .long 0x2b3cac5d, 0x2162d385
+ .long 0x65863b64, 0x8462d800
+ .long 0x1b03397f, 0x83348832
+ .long 0xebb883bd, 0x71d111a8
+ .long 0xb3e32c28, 0x299847d5
+ .long 0x064f7f26, 0xffd852c6
+ .long 0xdd7e3b0c, 0xb9e02b86
+ .long 0xf285651c, 0xdcb17aa4
+ .long 0x10746f3c, 0x18b33a4e
+ .long 0xc7a68855, 0xf37c5aee
+ .long 0x271d9844, 0xb6dd949b
+ .long 0x8e766a0c, 0x6051d5a2
+ .long 0x93a5f730, 0x78d9ccb7
+ .long 0x6cb08e5c, 0x18b0d4ff
+ .long 0x6b749fb2, 0xbac2fd7b
+ .long 0x1393e203, 0x21f3d99c
+ .long 0xcec3662e, 0xa60ce07b
+ .long 0x96c515bb, 0x8f158014
+ .long 0xe6fc4e6a, 0xce7f39f4
+ .long 0x8227bb8a, 0xa00457f7
+ .long 0xb0cd4768, 0x61d82e56
+ .long 0x39c7ff35, 0x8d6d2c43
+ .long 0xd7a4825c, 0xd270f1a2
+ .long 0x0ab3844b, 0x00ac29cf
+ .long 0x0167d312, 0xc619809d
+ .long 0xf6076544, 0xe9adf796
+ .long 0x26f6a60a, 0x2b3cac5d
+ .long 0xa741c1bf, 0x96638b34
+ .long 0x98d8d9cb, 0x65863b64
+ .long 0x49c3cc9c, 0xe0e9f351
+ .long 0x68bce87a, 0x1b03397f
+ .long 0x57a3d037, 0x9af01f2d
+ .long 0x6956fc3b, 0xebb883bd
+ .long 0x42d98888, 0x2cff42cf
+ .long 0x3771e98f, 0xb3e32c28
+ .long 0xb42ae3d9, 0x88f25a3a
+ .long 0x2178513a, 0x064f7f26
+ .long 0xe0ac139e, 0x4e36f0b0
+ .long 0x170076fa, 0xdd7e3b0c
+ .long 0x444dd413, 0xbd6f81f8
+ .long 0x6f345e45, 0xf285651c
+ .long 0x41d17b64, 0x91c9bd4b
+ .long 0xff0dba97, 0x10746f3c
+ .long 0xa2b73df1, 0x885f087b
+ .long 0xf872e54c, 0xc7a68855
+ .long 0x1e41e9fc, 0x4c144932
+ .long 0x86d8e4d2, 0x271d9844
+ .long 0x651bd98b, 0x52148f02
+ .long 0x5bb8f1bc, 0x8e766a0c
+ .long 0xa90fd27a, 0xa3c6f37a
+ .long 0xb3af077a, 0x93a5f730
+ .long 0x4984d782, 0xd7c0557f
+ .long 0xca6ef3ac, 0x6cb08e5c
+ .long 0x234e0b26, 0x63ded06a
+ .long 0xdd66cbbb, 0x6b749fb2
+ .long 0x4597456a, 0x4d56973c
+ .long 0xe9e28eb4, 0x1393e203
+ .long 0x7b3ff57a, 0x9669c9df
+ .long 0xc9c8b782, 0xcec3662e
+ .long 0x3f70cc6f, 0xe417f38a
+ .long 0x93e106a4, 0x96c515bb
+ .long 0x62ec6c6d, 0x4b9e0f71
+ .long 0xd813b325, 0xe6fc4e6a
+ .long 0x0df04680, 0xd104b8fc
+ .long 0x2342001e, 0x8227bb8a
+ .long 0x0a2a8d7e, 0x5b397730
+ .long 0x6d9a4957, 0xb0cd4768
+ .long 0xe8b6368b, 0xe78eb416
+ .long 0xd2c3ed1a, 0x39c7ff35
+ .long 0x995a5724, 0x61ff0e01
+ .long 0x9ef68d35, 0xd7a4825c
+ .long 0x0c139b31, 0x8d96551c
+ .long 0xf2271e60, 0x0ab3844b
+ .long 0x0b0bf8ca, 0x0bf80dd2
+ .long 0x2664fd8b, 0x0167d312
+ .long 0xed64812d, 0x8821abed
+ .long 0x02ee03b2, 0xf6076544
+ .long 0x8604ae0f, 0x6a45d2b2
+ .long 0x363bd6b3, 0x26f6a60a
+ .long 0x135c83fd, 0xd8d26619
+ .long 0x5fabe670, 0xa741c1bf
+ .long 0x35ec3279, 0xde87806c
+ .long 0x00bcf5f6, 0x98d8d9cb
+ .long 0x8ae00689, 0x14338754
+ .long 0x17f27698, 0x49c3cc9c
+ .long 0x58ca5f00, 0x5bd2011f
+ .long 0xaa7c7ad5, 0x68bce87a
+ .long 0xb5cfca28, 0xdd07448e
+ .long 0xded288f8, 0x57a3d037
+ .long 0x59f229bc, 0xdde8f5b9
+ .long 0x6d390dec, 0x6956fc3b
+ .long 0x37170390, 0xa3e3e02c
+ .long 0x6353c1cc, 0x42d98888
+ .long 0xc4584f5c, 0xd73c7bea
+ .long 0xf48642e9, 0x3771e98f
+ .long 0x531377e2, 0x80ff0093
+ .long 0xdd35bc8d, 0xb42ae3d9
+ .long 0xb25b29f2, 0x8fe4c34d
+ .long 0x9a5ede41, 0x2178513a
+ .long 0xa563905d, 0xdf99fc11
+ .long 0x45cddf4e, 0xe0ac139e
+ .long 0xacfa3103, 0x6c23e841
+ .long 0xa51b6135, 0x170076fa
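
A note on the `imul $2731, len_dw, %eax` in the partial-block path above:
multiplying by 2731 = ceil(2^16 / 24) and then shifting right by 16 computes
floor(len / 24) without a hardware division. The approximation is exact for
every length that can reach that path, because lengths of 128*24 and above
take the full-block path instead. A standalone check in plain C (this is not
kernel code; it exists only to verify the arithmetic):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	/* Only len < 128*24 can reach .Lpartial_block. */
	for (unsigned int len = 0; len < 128 * 24; len++)
		assert((len * 2731) >> 16 == len / 24);
	printf("(len * 2731) >> 16 == len / 24 for all len < %d\n", 128 * 24);
	return 0;
}
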
diff --git a/lib/crc/x86/crc64-pclmul.S b/lib/crc/x86/crc64-pclmul.S
new file mode 100644
index 000000000000..4173051b5197
--- /dev/null
+++ b/lib/crc/x86/crc64-pclmul.S
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+// Copyright 2025 Google LLC
+
+#include "crc-pclmul-template.S"
+
+DEFINE_CRC_PCLMUL_FUNCS(crc64_msb, /* bits= */ 64, /* lsb= */ 0)
+DEFINE_CRC_PCLMUL_FUNCS(crc64_lsb, /* bits= */ 64, /* lsb= */ 1)
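
The template is instantiated twice because CRC64 is used in both bit orders:
crc64_msb processes message bits most-significant-first and backs crc64_be
(generator polynomial 0x42f0e1eba9ea3693), while crc64_lsb is the
bit-reflected convention backing crc64_nvme (reflected polynomial
0x9a6c9329ac4bc9b5); both constants appear in crc64.h below. For reference,
the two conventions correspond to the following one-byte bitwise update
steps. This is an illustrative model only; the real functions are
carryless-multiply based and process whole vectors at a time:

#include <stdint.h>

/* MSB-first: message bytes enter at the top of the CRC register. */
static uint64_t crc64_msb_step(uint64_t crc, uint8_t byte)
{
	crc ^= (uint64_t)byte << 56;
	for (int i = 0; i < 8; i++)
		crc = crc & (1ULL << 63) ?
			(crc << 1) ^ 0x42f0e1eba9ea3693ULL : crc << 1;
	return crc;
}

/* LSB-first (bit-reflected): message bytes enter at the bottom. */
static uint64_t crc64_lsb_step(uint64_t crc, uint8_t byte)
{
	crc ^= byte;
	for (int i = 0; i < 8; i++)
		crc = crc & 1 ? (crc >> 1) ^ 0x9a6c9329ac4bc9b5ULL : crc >> 1;
	return crc;
}
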
diff --git a/lib/crc/x86/crc64.h b/lib/crc/x86/crc64.h
new file mode 100644
index 000000000000..7d4599319343
--- /dev/null
+++ b/lib/crc/x86/crc64.h
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * CRC64 using [V]PCLMULQDQ instructions
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include "crc-pclmul-template.h"
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+
+DECLARE_CRC_PCLMUL_FUNCS(crc64_msb, u64);
+DECLARE_CRC_PCLMUL_FUNCS(crc64_lsb, u64);
+
+static inline u64 crc64_be_arch(u64 crc, const u8 *p, size_t len)
+{
+ CRC_PCLMUL(crc, p, len, crc64_msb, crc64_msb_0x42f0e1eba9ea3693_consts,
+ have_pclmulqdq);
+ return crc64_be_generic(crc, p, len);
+}
+
+static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
+{
+ CRC_PCLMUL(crc, p, len, crc64_lsb, crc64_lsb_0x9a6c9329ac4bc9b5_consts,
+ have_pclmulqdq);
+ return crc64_nvme_generic(crc, p, len);
+}
+
+#define crc64_mod_init_arch crc64_mod_init_arch
+static void crc64_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
+ static_branch_enable(&have_pclmulqdq);
+ if (have_vpclmul()) {
+ if (have_avx512()) {
+ static_call_update(crc64_msb_pclmul,
+ crc64_msb_vpclmul_avx512);
+ static_call_update(crc64_lsb_pclmul,
+ crc64_lsb_vpclmul_avx512);
+ } else {
+ static_call_update(crc64_msb_pclmul,
+ crc64_msb_vpclmul_avx2);
+ static_call_update(crc64_lsb_pclmul,
+ crc64_lsb_vpclmul_avx2);
+ }
+ }
+ }
+}
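
This header stacks two dispatch mechanisms. The static key have_pclmulqdq
decides at run time whether to enter the SIMD path at all, and the static
calls crc64_msb_pclmul / crc64_lsb_pclmul are retargeted once at init to the
best available [V]PCLMULQDQ variant, so each call costs one patched direct
branch instead of a chain of CPU-feature tests. The same pattern in miniature
(every name below is illustrative, not the kernel's; the real declarations
come from the DECLARE/DEFINE_CRC_PCLMUL_FUNCS macros, and the real SIMD path
also brackets the call with kernel FPU begin/end, which this sketch omits):

#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/static_call.h>
#include <asm/cpufeature.h>

static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_accel);

u64 my_crc_generic(u64 crc, const u8 *p, size_t len);
u64 my_crc_pclmul(u64 crc, const u8 *p, size_t len);
u64 my_crc_vpclmul(u64 crc, const u8 *p, size_t len);

DEFINE_STATIC_CALL(my_crc_simd, my_crc_pclmul);	/* default target */

u64 my_crc(u64 crc, const u8 *p, size_t len)
{
	if (static_branch_likely(&have_accel))
		return static_call(my_crc_simd)(crc, p, len);
	return my_crc_generic(crc, p, len);
}

static void my_crc_init(void)
{
	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
		static_branch_enable(&have_accel);
		if (boot_cpu_has(X86_FEATURE_VPCLMULQDQ))
			static_call_update(my_crc_simd, my_crc_vpclmul);
	}
}
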
diff --git a/lib/crc32.c b/lib/crc32.c
deleted file mode 100644
index 95429861d3ac..000000000000
--- a/lib/crc32.c
+++ /dev/null
@@ -1,126 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
- * cleaned up code to current version of sparse and added the slicing-by-8
- * algorithm to the closely similar existing slicing-by-4 algorithm.
- *
- * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com>
- * Nicer crc32 functions/docs submitted by linux@horizon.com. Thanks!
- * Code was from the public domain, copyright abandoned. Code was
- * subsequently included in the kernel, thus was re-licensed under the
- * GNU GPL v2.
- *
- * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com>
- * Same crc32 function was used in 5 other places in the kernel.
- * I made one version, and deleted the others.
- * There are various incantations of crc32(). Some use a seed of 0 or ~0.
- * Some xor at the end with ~0. The generic crc32() function takes
- * seed as an argument, and doesn't xor at the end. Then individual
- * users can do whatever they need.
- * drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0.
- * fs/jffs2 uses seed 0, doesn't xor with ~0.
- * fs/partitions/efi.c uses seed ~0, xor's with ~0.
- */
-
-/* see: Documentation/staging/crc32.rst for a description of algorithms */
-
-#include <linux/crc32.h>
-#include <linux/crc32poly.h>
-#include <linux/module.h>
-#include <linux/types.h>
-
-#include "crc32table.h"
-
-MODULE_AUTHOR("Matt Domsch <Matt_Domsch@dell.com>");
-MODULE_DESCRIPTION("Various CRC32 calculations");
-MODULE_LICENSE("GPL");
-
-u32 crc32_le_base(u32 crc, const u8 *p, size_t len)
-{
- while (len--)
- crc = (crc >> 8) ^ crc32table_le[(crc & 255) ^ *p++];
- return crc;
-}
-EXPORT_SYMBOL(crc32_le_base);
-
-u32 crc32c_base(u32 crc, const u8 *p, size_t len)
-{
- while (len--)
- crc = (crc >> 8) ^ crc32ctable_le[(crc & 255) ^ *p++];
- return crc;
-}
-EXPORT_SYMBOL(crc32c_base);
-
-/*
- * This multiplies the polynomials x and y modulo the given modulus.
- * This follows the "little-endian" CRC convention that the lsbit
- * represents the highest power of x, and the msbit represents x^0.
- */
-static u32 gf2_multiply(u32 x, u32 y, u32 modulus)
-{
- u32 product = x & 1 ? y : 0;
- int i;
-
- for (i = 0; i < 31; i++) {
- product = (product >> 1) ^ (product & 1 ? modulus : 0);
- x >>= 1;
- product ^= x & 1 ? y : 0;
- }
-
- return product;
-}
-
-/**
- * crc32_generic_shift - Append @len 0 bytes to crc, in logarithmic time
- * @crc: The original little-endian CRC (i.e. lsbit is x^31 coefficient)
- * @len: The number of bytes. @crc is multiplied by x^(8*@len)
- * @polynomial: The modulus used to reduce the result to 32 bits.
- *
- * It's possible to parallelize CRC computations by computing a CRC
- * over separate ranges of a buffer, then summing them.
- * This shifts the given CRC by 8*len bits (i.e. produces the same effect
- * as appending len bytes of zero to the data), in time proportional
- * to log(len).
- */
-static u32 crc32_generic_shift(u32 crc, size_t len, u32 polynomial)
-{
- u32 power = polynomial; /* CRC of x^32 */
- int i;
-
- /* Shift up to 32 bits in the simple linear way */
- for (i = 0; i < 8 * (int)(len & 3); i++)
- crc = (crc >> 1) ^ (crc & 1 ? polynomial : 0);
-
- len >>= 2;
- if (!len)
- return crc;
-
- for (;;) {
- /* "power" is x^(2^i), modulo the polynomial */
- if (len & 1)
- crc = gf2_multiply(crc, power, polynomial);
-
- len >>= 1;
- if (!len)
- break;
-
- /* Square power, advancing to x^(2^(i+1)) */
- power = gf2_multiply(power, power, polynomial);
- }
-
- return crc;
-}
-
-u32 crc32_le_shift(u32 crc, size_t len)
-{
- return crc32_generic_shift(crc, len, CRC32_POLY_LE);
-}
-EXPORT_SYMBOL(crc32_le_shift);
-
-u32 crc32_be_base(u32 crc, const u8 *p, size_t len)
-{
- while (len--)
- crc = (crc << 8) ^ crc32table_be[(crc >> 24) ^ *p++];
- return crc;
-}
-EXPORT_SYMBOL(crc32_be_base);
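
One piece of the deleted file is worth connecting back to the assembly above:
crc32_generic_shift() is the scalar counterpart of the PCLMULQDQ combine step
in crc32c-3way.S. Both multiply a CRC by x^(8*len) modulo the polynomial so
that CRCs computed over separate ranges can be merged, per the identity
crc32(seed, A||B) == shift(crc32(seed, A), len(B)) ^ crc32(0, B). A
self-contained user-space check, reusing the same bitwise logic as the code
above (CRC32_POLY_LE is the usual 0xedb88320; the message and split point are
arbitrary):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define CRC32_POLY_LE 0xedb88320u

/* One bit at a time; equivalent to the table-driven crc32_le_base(). */
static uint32_t crc32_le_bit(uint32_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (crc & 1 ? CRC32_POLY_LE : 0);
	}
	return crc;
}

/* Same logic as gf2_multiply() above. */
static uint32_t gf2_multiply(uint32_t x, uint32_t y, uint32_t modulus)
{
	uint32_t product = x & 1 ? y : 0;

	for (int i = 0; i < 31; i++) {
		product = (product >> 1) ^ (product & 1 ? modulus : 0);
		x >>= 1;
		product ^= x & 1 ? y : 0;
	}
	return product;
}

/* Same logic as crc32_generic_shift() above: append len zero bytes. */
static uint32_t crc32_shift(uint32_t crc, size_t len)
{
	uint32_t power = CRC32_POLY_LE;		/* CRC of x^32 */

	for (int i = 0; i < 8 * (int)(len & 3); i++)
		crc = (crc >> 1) ^ (crc & 1 ? CRC32_POLY_LE : 0);
	for (len >>= 2; len; len >>= 1) {
		if (len & 1)
			crc = gf2_multiply(crc, power, CRC32_POLY_LE);
		power = gf2_multiply(power, power, CRC32_POLY_LE);
	}
	return crc;
}

int main(void)
{
	const uint8_t msg[] = "two ranges, CRCed independently, then merged";
	size_t n = sizeof(msg) - 1, split = 17;
	uint32_t whole = crc32_le_bit(~0u, msg, n);
	uint32_t a = crc32_le_bit(~0u, msg, split);
	uint32_t b = crc32_le_bit(0, msg + split, n - split);

	assert((crc32_shift(a, n - split) ^ b) == whole);
	printf("combined CRC matches: %08x\n", whole);
	return 0;
}
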
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 1ec1466108cc..a3647352bff6 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -2,6 +2,9 @@
menu "Crypto library routines"
+config CRYPTO_HASH_INFO
+ bool
+
config CRYPTO_LIB_UTILS
tristate
@@ -25,108 +28,124 @@ config CRYPTO_LIB_ARC4
config CRYPTO_LIB_GF128MUL
tristate
-config CRYPTO_ARCH_HAVE_LIB_BLAKE2S
- bool
+config CRYPTO_LIB_BLAKE2B
+ tristate
help
- Declares whether the architecture provides an arch-specific
- accelerated implementation of the Blake2s library interface,
- either builtin or as a module.
+ The BLAKE2b library functions. Select this if your module uses any of
+ the functions from <crypto/blake2b.h>.
-config CRYPTO_LIB_BLAKE2S_GENERIC
- def_bool !CRYPTO_ARCH_HAVE_LIB_BLAKE2S
- help
- This symbol can be depended upon by arch implementations of the
- Blake2s library interface that require the generic code as a
- fallback, e.g., for SIMD implementations. If no arch specific
- implementation is enabled, this implementation serves the users
- of CRYPTO_LIB_BLAKE2S.
+config CRYPTO_LIB_BLAKE2B_ARCH
+ bool
+ depends on CRYPTO_LIB_BLAKE2B && !UML
+ default y if ARM && KERNEL_MODE_NEON
-config CRYPTO_ARCH_HAVE_LIB_CHACHA
+# BLAKE2s support is always built-in, so there's no CRYPTO_LIB_BLAKE2S option.
+
+config CRYPTO_LIB_BLAKE2S_ARCH
bool
- help
- Declares whether the architecture provides an arch-specific
- accelerated implementation of the ChaCha library interface,
- either builtin or as a module.
+ depends on !UML
+ default y if ARM
+ default y if X86_64
-config CRYPTO_LIB_CHACHA_GENERIC
+config CRYPTO_LIB_CHACHA
tristate
- default CRYPTO_LIB_CHACHA if !CRYPTO_ARCH_HAVE_LIB_CHACHA
select CRYPTO_LIB_UTILS
help
- This symbol can be selected by arch implementations of the ChaCha
- library interface that require the generic code as a fallback, e.g.,
- for SIMD implementations. If no arch specific implementation is
- enabled, this implementation serves the users of CRYPTO_LIB_CHACHA.
+ Enable the ChaCha library interface. Select this if your module uses
+ chacha_crypt() or hchacha_block().
-config CRYPTO_LIB_CHACHA
+config CRYPTO_LIB_CHACHA_ARCH
+ bool
+ depends on CRYPTO_LIB_CHACHA && !UML && !KMSAN
+ default y if ARM
+ default y if ARM64 && KERNEL_MODE_NEON
+ default y if MIPS && CPU_MIPS32_R2
+ default y if PPC64 && CPU_LITTLE_ENDIAN && VSX
+ default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ default y if S390
+ default y if X86_64
+
+config CRYPTO_LIB_CURVE25519
tristate
+ select CRYPTO_LIB_UTILS
help
- Enable the ChaCha library interface. This interface may be fulfilled
- by either the generic implementation or an arch-specific one, if one
- is available and enabled.
+ The Curve25519 library functions. Select this if your module uses any
+ of the functions from <crypto/curve25519.h>.
-config CRYPTO_ARCH_HAVE_LIB_CURVE25519
+config CRYPTO_LIB_CURVE25519_ARCH
bool
- help
- Declares whether the architecture provides an arch-specific
- accelerated implementation of the Curve25519 library interface,
- either builtin or as a module.
+ depends on CRYPTO_LIB_CURVE25519 && !UML && !KMSAN
+ default y if ARM && KERNEL_MODE_NEON && !CPU_BIG_ENDIAN
+ default y if PPC64 && CPU_LITTLE_ENDIAN
+ default y if X86_64
config CRYPTO_LIB_CURVE25519_GENERIC
- tristate
- select CRYPTO_LIB_UTILS
- help
- This symbol can be depended upon by arch implementations of the
- Curve25519 library interface that require the generic code as a
- fallback, e.g., for SIMD implementations. If no arch specific
- implementation is enabled, this implementation serves the users
- of CRYPTO_LIB_CURVE25519.
+ bool
+ depends on CRYPTO_LIB_CURVE25519
+ default y if !CRYPTO_LIB_CURVE25519_ARCH || ARM || X86_64
-config CRYPTO_LIB_CURVE25519_INTERNAL
+config CRYPTO_LIB_DES
tristate
- select CRYPTO_LIB_CURVE25519_GENERIC if CRYPTO_ARCH_HAVE_LIB_CURVE25519=n
-config CRYPTO_LIB_CURVE25519
+config CRYPTO_LIB_MD5
tristate
- select CRYPTO
- select CRYPTO_LIB_CURVE25519_INTERNAL
help
- Enable the Curve25519 library interface. This interface may be
- fulfilled by either the generic implementation or an arch-specific
- one, if one is available and enabled.
+ The MD5 and HMAC-MD5 library functions. Select this if your module
+ uses any of the functions from <crypto/md5.h>.
-config CRYPTO_LIB_DES
+config CRYPTO_LIB_MD5_ARCH
+ bool
+ depends on CRYPTO_LIB_MD5 && !UML
+ default y if MIPS && CPU_CAVIUM_OCTEON
+ default y if PPC
+ default y if SPARC64
+
+config CRYPTO_LIB_POLY1305
tristate
+ help
+ The Poly1305 library functions. Select this if your module uses any
+ of the functions from <crypto/poly1305.h>.
+
+config CRYPTO_LIB_POLY1305_ARCH
+ bool
+ depends on CRYPTO_LIB_POLY1305 && !UML && !KMSAN
+ default y if ARM
+ default y if ARM64 && KERNEL_MODE_NEON
+ default y if MIPS
+ # The PPC64 code needs to be fixed to work in softirq context.
+ default y if PPC64 && CPU_LITTLE_ENDIAN && VSX && BROKEN
+ default y if RISCV
+ default y if X86_64
+
+# This symbol controls the inclusion of the Poly1305 generic code. This differs
+# from most of the other algorithms, which handle the generic code
+# "automatically" via __maybe_unused. This is needed so that the Adiantum code,
+# which calls the poly1305_core_*() functions directly, can enable them.
+config CRYPTO_LIB_POLY1305_GENERIC
+ bool
+ depends on CRYPTO_LIB_POLY1305
+ # Enable if there's no arch impl or the arch impl requires the generic
+ # impl as a fallback. (Or if selected explicitly.)
+ default y if !CRYPTO_LIB_POLY1305_ARCH || PPC64
config CRYPTO_LIB_POLY1305_RSIZE
int
- default 2 if MIPS
+ default 2 if MIPS || RISCV
default 11 if X86_64
default 9 if ARM || ARM64
default 1
-config CRYPTO_ARCH_HAVE_LIB_POLY1305
- bool
- help
- Declares whether the architecture provides an arch-specific
- accelerated implementation of the Poly1305 library interface,
- either builtin or as a module.
-
-config CRYPTO_LIB_POLY1305_GENERIC
+config CRYPTO_LIB_POLYVAL
tristate
- default CRYPTO_LIB_POLY1305 if !CRYPTO_ARCH_HAVE_LIB_POLY1305
help
- This symbol can be selected by arch implementations of the Poly1305
- library interface that require the generic code as a fallback, e.g.,
- for SIMD implementations. If no arch specific implementation is
- enabled, this implementation serves the users of CRYPTO_LIB_POLY1305.
+ The POLYVAL library functions. Select this if your module uses any of
+ the functions from <crypto/polyval.h>.
-config CRYPTO_LIB_POLY1305
- tristate
- help
- Enable the Poly1305 library interface. This interface may be fulfilled
- by either the generic implementation or an arch-specific one, if one
- is available and enabled.
+config CRYPTO_LIB_POLYVAL_ARCH
+ bool
+ depends on CRYPTO_LIB_POLYVAL && !UML
+ default y if ARM64 && KERNEL_MODE_NEON
+ default y if X86_64
config CRYPTO_LIB_CHACHA20POLY1305
tristate
@@ -136,65 +155,74 @@ config CRYPTO_LIB_CHACHA20POLY1305
config CRYPTO_LIB_SHA1
tristate
+ help
+ The SHA-1 and HMAC-SHA1 library functions. Select this if your module
+ uses any of the functions from <crypto/sha1.h>.
+
+config CRYPTO_LIB_SHA1_ARCH
+ bool
+ depends on CRYPTO_LIB_SHA1 && !UML
+ default y if ARM
+ default y if ARM64 && KERNEL_MODE_NEON
+ default y if MIPS && CPU_CAVIUM_OCTEON
+ default y if PPC
+ default y if S390
+ default y if SPARC64
+ default y if X86_64
config CRYPTO_LIB_SHA256
tristate
help
- Enable the SHA-256 library interface. This interface may be fulfilled
- by either the generic implementation or an arch-specific one, if one
- is available and enabled.
+ The SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions.
+ Select this if your module uses any of these functions from
+ <crypto/sha2.h>.
-config CRYPTO_ARCH_HAVE_LIB_SHA256
+config CRYPTO_LIB_SHA256_ARCH
bool
+ depends on CRYPTO_LIB_SHA256 && !UML
+ default y if ARM && !CPU_V7M
+ default y if ARM64
+ default y if MIPS && CPU_CAVIUM_OCTEON
+ default y if PPC && SPE
+ default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ default y if S390
+ default y if SPARC64
+ default y if X86_64
+
+config CRYPTO_LIB_SHA512
+ tristate
help
- Declares whether the architecture provides an arch-specific
- accelerated implementation of the SHA-256 library interface.
+ The SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions.
+ Select this if your module uses any of these functions from
+ <crypto/sha2.h>.
-config CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
+config CRYPTO_LIB_SHA512_ARCH
bool
- help
- Declares whether the architecture provides an arch-specific
- accelerated implementation of the SHA-256 library interface
- that is SIMD-based and therefore not usable in hardirq
- context.
+ depends on CRYPTO_LIB_SHA512 && !UML
+ default y if ARM && !CPU_V7M
+ default y if ARM64
+ default y if MIPS && CPU_CAVIUM_OCTEON
+ default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ default y if S390
+ default y if SPARC64
+ default y if X86_64
-config CRYPTO_LIB_SHA256_GENERIC
+config CRYPTO_LIB_SHA3
tristate
- default CRYPTO_LIB_SHA256 if !CRYPTO_ARCH_HAVE_LIB_SHA256
+ select CRYPTO_LIB_UTILS
help
- This symbol can be selected by arch implementations of the SHA-256
- library interface that require the generic code as a fallback, e.g.,
- for SIMD implementations. If no arch specific implementation is
- enabled, this implementation serves the users of CRYPTO_LIB_SHA256.
+	  The SHA-3 library functions. Select this if your module uses any of
+ the functions from <crypto/sha3.h>.
+
+config CRYPTO_LIB_SHA3_ARCH
+ bool
+ depends on CRYPTO_LIB_SHA3 && !UML
+ default y if ARM64 && KERNEL_MODE_NEON
+ default y if S390
config CRYPTO_LIB_SM3
tristate
-if !KMSAN # avoid false positives from assembly
-if ARM
-source "arch/arm/lib/crypto/Kconfig"
-endif
-if ARM64
-source "arch/arm64/lib/crypto/Kconfig"
-endif
-if MIPS
-source "arch/mips/lib/crypto/Kconfig"
-endif
-if PPC
-source "arch/powerpc/lib/crypto/Kconfig"
-endif
-if RISCV
-source "arch/riscv/lib/crypto/Kconfig"
-endif
-if S390
-source "arch/s390/lib/crypto/Kconfig"
-endif
-if SPARC
-source "arch/sparc/lib/crypto/Kconfig"
-endif
-if X86
-source "arch/x86/lib/crypto/Kconfig"
-endif
-endif
+source "lib/crypto/tests/Kconfig"
endmenu
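
The overall shape of the rewrite is visible here: each user-facing
CRYPTO_LIB_<ALG> symbol is now a plain tristate, the matching
CRYPTO_LIB_<ALG>_ARCH bool merely records whether the arch subdirectory's
implementation should be built in, and the old CRYPTO_ARCH_HAVE_LIB_* /
*_GENERIC symbol pairs disappear. As the Poly1305 comment above notes, most
algorithms no longer need a Kconfig symbol for the generic code at all; the
generic routine is marked __maybe_unused and the arch header decides whether
it is referenced. A sketch of that C-side pattern, with illustrative file and
function names (the -I$(src)/$(SRCARCH) include trick is the one used in the
Makefile below):

/* foo.c -- illustrative only */
#include <linux/types.h>

static void __maybe_unused foo_blocks_generic(u32 *state, const u8 *data,
					      size_t nblocks)
{
	/* portable implementation ... */
}

#ifdef CONFIG_CRYPTO_LIB_FOO_ARCH
/*
 * Resolved via -I$(src)/$(SRCARCH): the arch header defines foo_blocks(),
 * typically calling foo_blocks_generic() as a fallback, in which case the
 * generic body above is retained; otherwise the compiler discards it.
 */
#include "foo.h"
#else
#define foo_blocks foo_blocks_generic
#endif
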
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 3e79283b617d..b5346cebbb55 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -1,12 +1,20 @@
# SPDX-License-Identifier: GPL-2.0
+aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1
+
+quiet_cmd_perlasm = PERLASM $@
+ cmd_perlasm = $(PERL) $(<) > $(@)
+
+quiet_cmd_perlasm_with_args = PERLASM $@
+ cmd_perlasm_with_args = $(PERL) $(<) void $(@)
+
+obj-$(CONFIG_KUNIT) += tests/
+
+obj-$(CONFIG_CRYPTO_HASH_INFO) += hash_info.o
+
obj-$(CONFIG_CRYPTO_LIB_UTILS) += libcryptoutils.o
libcryptoutils-y := memneq.o utils.o
-# chacha is used by the /dev/random driver which is always builtin
-obj-y += chacha.o
-obj-$(CONFIG_CRYPTO_LIB_CHACHA_GENERIC) += libchacha.o
-
obj-$(CONFIG_CRYPTO_LIB_AES) += libaes.o
libaes-y := aes.o
@@ -21,48 +29,282 @@ libarc4-y := arc4.o
obj-$(CONFIG_CRYPTO_LIB_GF128MUL) += gf128mul.o
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_BLAKE2B) += libblake2b.o
+libblake2b-y := blake2b.o
+CFLAGS_blake2b.o := -Wframe-larger-than=4096 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105930
+ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2B_ARCH),y)
+CFLAGS_blake2b.o += -I$(src)/$(SRCARCH)
+libblake2b-$(CONFIG_ARM) += arm/blake2b-neon-core.o
+endif # CONFIG_CRYPTO_LIB_BLAKE2B_ARCH
+
+################################################################################
+
# blake2s is used by the /dev/random driver which is always builtin
-obj-y += libblake2s.o
-libblake2s-y := blake2s.o
-libblake2s-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) += blake2s-generic.o
-libblake2s-$(CONFIG_CRYPTO_SELFTESTS) += blake2s-selftest.o
+obj-y += blake2s.o
+ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2S_ARCH),y)
+CFLAGS_blake2s.o += -I$(src)/$(SRCARCH)
+obj-$(CONFIG_ARM) += arm/blake2s-core.o
+obj-$(CONFIG_X86) += x86/blake2s-core.o
+endif
+
+################################################################################
+
+# chacha20_block() is used by the /dev/random driver which is always builtin
+obj-y += chacha-block-generic.o
+
+obj-$(CONFIG_CRYPTO_LIB_CHACHA) += libchacha.o
+libchacha-y := chacha.o
+
+ifeq ($(CONFIG_CRYPTO_LIB_CHACHA_ARCH),y)
+CFLAGS_chacha.o += -I$(src)/$(SRCARCH)
+
+ifeq ($(CONFIG_ARM),y)
+libchacha-y += arm/chacha-scalar-core.o
+libchacha-$(CONFIG_KERNEL_MODE_NEON) += arm/chacha-neon-core.o
+endif
+
+libchacha-$(CONFIG_ARM64) += arm64/chacha-neon-core.o
+
+ifeq ($(CONFIG_MIPS),y)
+libchacha-y += mips/chacha-core.o
+AFLAGS_mips/chacha-core.o += -O2 # needed to fill branch delay slots
+endif
+
+libchacha-$(CONFIG_PPC) += powerpc/chacha-p10le-8x.o
+libchacha-$(CONFIG_RISCV) += riscv/chacha-riscv64-zvkb.o
+libchacha-$(CONFIG_S390) += s390/chacha-s390.o
+libchacha-$(CONFIG_X86) += x86/chacha-ssse3-x86_64.o \
+ x86/chacha-avx2-x86_64.o \
+ x86/chacha-avx512vl-x86_64.o
+endif # CONFIG_CRYPTO_LIB_CHACHA_ARCH
+
+################################################################################
obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305) += libchacha20poly1305.o
libchacha20poly1305-y += chacha20poly1305.o
libchacha20poly1305-$(CONFIG_CRYPTO_SELFTESTS) += chacha20poly1305-selftest.o
-obj-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += libcurve25519-generic.o
-libcurve25519-generic-y := curve25519-fiat32.o
-libcurve25519-generic-$(CONFIG_ARCH_SUPPORTS_INT128) := curve25519-hacl64.o
-libcurve25519-generic-y += curve25519-generic.o
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_CURVE25519) += libcurve25519.o
+libcurve25519-y := curve25519.o
-obj-$(CONFIG_CRYPTO_LIB_CURVE25519) += libcurve25519.o
-libcurve25519-y += curve25519.o
-libcurve25519-$(CONFIG_CRYPTO_SELFTESTS) += curve25519-selftest.o
+# Disable GCOV profiling in this stack- and performance-sensitive code
+GCOV_PROFILE_curve25519.o := n
+
+ifeq ($(CONFIG_ARCH_SUPPORTS_INT128),y)
+libcurve25519-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += curve25519-hacl64.o
+else
+libcurve25519-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += curve25519-fiat32.o
+endif
+# clang versions prior to 18 may blow out the stack with KASAN
+ifeq ($(CONFIG_CC_IS_CLANG)_$(call clang-min-version, 180000),y_)
+KASAN_SANITIZE_curve25519-hacl64.o := n
+endif
+
+ifeq ($(CONFIG_CRYPTO_LIB_CURVE25519_ARCH),y)
+CFLAGS_curve25519.o += -I$(src)/$(SRCARCH)
+libcurve25519-$(CONFIG_ARM) += arm/curve25519-core.o
+libcurve25519-$(CONFIG_PPC) += powerpc/curve25519-ppc64le_asm.o
+endif
+
+################################################################################
obj-$(CONFIG_CRYPTO_LIB_DES) += libdes.o
libdes-y := des.o
-obj-$(CONFIG_CRYPTO_LIB_POLY1305) += libpoly1305.o
-libpoly1305-y += poly1305.o
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_MD5) += libmd5.o
+libmd5-y := md5.o
+ifeq ($(CONFIG_CRYPTO_LIB_MD5_ARCH),y)
+CFLAGS_md5.o += -I$(src)/$(SRCARCH)
+libmd5-$(CONFIG_PPC) += powerpc/md5-asm.o
+libmd5-$(CONFIG_SPARC) += sparc/md5_asm.o
+endif # CONFIG_CRYPTO_LIB_MD5_ARCH
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_POLY1305) += libpoly1305.o
+libpoly1305-y := poly1305.o
+ifeq ($(CONFIG_ARCH_SUPPORTS_INT128),y)
+libpoly1305-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += poly1305-donna64.o
+else
+libpoly1305-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += poly1305-donna32.o
+endif
-obj-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += libpoly1305-generic.o
-libpoly1305-generic-y := poly1305-donna32.o
-libpoly1305-generic-$(CONFIG_ARCH_SUPPORTS_INT128) := poly1305-donna64.o
-libpoly1305-generic-y += poly1305-generic.o
+ifeq ($(CONFIG_CRYPTO_LIB_POLY1305_ARCH),y)
+CFLAGS_poly1305.o += -I$(src)/$(SRCARCH)
-obj-$(CONFIG_CRYPTO_LIB_SHA1) += libsha1.o
-libsha1-y := sha1.o
+ifeq ($(CONFIG_ARM),y)
+libpoly1305-y += arm/poly1305-core.o
+$(obj)/arm/poly1305-core.S: $(src)/arm/poly1305-armv4.pl
+ $(call cmd,perlasm)
+# massage the perlasm code a bit so we only get the NEON routine if we need it
+poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
+poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
+AFLAGS_arm/poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y)
+endif
-obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o
-libsha256-y := sha256.o
+ifeq ($(CONFIG_ARM64),y)
+libpoly1305-y += arm64/poly1305-core.o
+$(obj)/arm64/poly1305-core.S: $(src)/arm64/poly1305-armv8.pl
+ $(call cmd,perlasm_with_args)
+endif
-obj-$(CONFIG_CRYPTO_LIB_SHA256_GENERIC) += libsha256-generic.o
-libsha256-generic-y := sha256-generic.o
+ifeq ($(CONFIG_MIPS),y)
+libpoly1305-y += mips/poly1305-core.o
+poly1305-perlasm-flavour-$(CONFIG_32BIT) := o32
+poly1305-perlasm-flavour-$(CONFIG_64BIT) := 64
+quiet_cmd_perlasm_poly1305 = PERLASM $@
+ cmd_perlasm_poly1305 = $(PERL) $< $(poly1305-perlasm-flavour-y) $@
+# Use if_changed instead of cmd, in case the flavour changed.
+$(obj)/mips/poly1305-core.S: $(src)/mips/poly1305-mips.pl FORCE
+ $(call if_changed,perlasm_poly1305)
+targets += mips/poly1305-core.S
+endif
+
+libpoly1305-$(CONFIG_PPC) += powerpc/poly1305-p10le_64.o
+
+ifeq ($(CONFIG_RISCV),y)
+libpoly1305-y += riscv/poly1305-core.o
+poly1305-perlasm-flavour-$(CONFIG_32BIT) := 32
+poly1305-perlasm-flavour-$(CONFIG_64BIT) := 64
+quiet_cmd_perlasm_poly1305 = PERLASM $@
+ cmd_perlasm_poly1305 = $(PERL) $< $(poly1305-perlasm-flavour-y) $@
+# Use if_changed instead of cmd, in case the flavour changed.
+$(obj)/riscv/poly1305-core.S: $(src)/riscv/poly1305-riscv.pl FORCE
+ $(call if_changed,perlasm_poly1305)
+targets += riscv/poly1305-core.S
+AFLAGS_riscv/poly1305-core.o += -Dpoly1305_init=poly1305_block_init
+endif
+
+ifeq ($(CONFIG_X86),y)
+libpoly1305-y += x86/poly1305-x86_64-cryptogams.o
+$(obj)/x86/poly1305-x86_64-cryptogams.S: $(src)/x86/poly1305-x86_64-cryptogams.pl
+ $(call cmd,perlasm)
+endif
+
+endif # CONFIG_CRYPTO_LIB_POLY1305_ARCH
+
+# clean-files must be defined unconditionally
+clean-files += arm/poly1305-core.S \
+ arm64/poly1305-core.S \
+ mips/poly1305-core.S \
+ riscv/poly1305-core.S \
+ x86/poly1305-x86_64-cryptogams.S
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_POLYVAL) += libpolyval.o
+libpolyval-y := polyval.o
+ifeq ($(CONFIG_CRYPTO_LIB_POLYVAL_ARCH),y)
+CFLAGS_polyval.o += -I$(src)/$(SRCARCH)
+libpolyval-$(CONFIG_ARM64) += arm64/polyval-ce-core.o
+libpolyval-$(CONFIG_X86) += x86/polyval-pclmul-avx.o
+endif
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_SHA1) += libsha1.o
+libsha1-y := sha1.o
+ifeq ($(CONFIG_CRYPTO_LIB_SHA1_ARCH),y)
+CFLAGS_sha1.o += -I$(src)/$(SRCARCH)
+ifeq ($(CONFIG_ARM),y)
+libsha1-y += arm/sha1-armv4-large.o
+libsha1-$(CONFIG_KERNEL_MODE_NEON) += arm/sha1-armv7-neon.o \
+ arm/sha1-ce-core.o
+endif
+libsha1-$(CONFIG_ARM64) += arm64/sha1-ce-core.o
+ifeq ($(CONFIG_PPC),y)
+libsha1-y += powerpc/sha1-powerpc-asm.o
+libsha1-$(CONFIG_SPE) += powerpc/sha1-spe-asm.o
+endif
+libsha1-$(CONFIG_SPARC) += sparc/sha1_asm.o
+libsha1-$(CONFIG_X86) += x86/sha1-ssse3-and-avx.o \
+ x86/sha1-avx2-asm.o \
+ x86/sha1-ni-asm.o
+endif # CONFIG_CRYPTO_LIB_SHA1_ARCH
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o
+libsha256-y := sha256.o
+ifeq ($(CONFIG_CRYPTO_LIB_SHA256_ARCH),y)
+CFLAGS_sha256.o += -I$(src)/$(SRCARCH)
+
+ifeq ($(CONFIG_ARM),y)
+libsha256-y += arm/sha256-ce.o arm/sha256-core.o
+$(obj)/arm/sha256-core.S: $(src)/arm/sha256-armv4.pl
+ $(call cmd,perlasm)
+AFLAGS_arm/sha256-core.o += $(aflags-thumb2-y)
+endif
+
+ifeq ($(CONFIG_ARM64),y)
+libsha256-y += arm64/sha256-core.o
+$(obj)/arm64/sha256-core.S: $(src)/arm64/sha2-armv8.pl
+ $(call cmd,perlasm_with_args)
+libsha256-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha256-ce.o
+endif
+
+libsha256-$(CONFIG_PPC) += powerpc/sha256-spe-asm.o
+libsha256-$(CONFIG_RISCV) += riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.o
+libsha256-$(CONFIG_SPARC) += sparc/sha256_asm.o
+libsha256-$(CONFIG_X86) += x86/sha256-ssse3-asm.o \
+ x86/sha256-avx-asm.o \
+ x86/sha256-avx2-asm.o \
+ x86/sha256-ni-asm.o
+endif # CONFIG_CRYPTO_LIB_SHA256_ARCH
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_SHA512) += libsha512.o
+libsha512-y := sha512.o
+ifeq ($(CONFIG_CRYPTO_LIB_SHA512_ARCH),y)
+CFLAGS_sha512.o += -I$(src)/$(SRCARCH)
+
+ifeq ($(CONFIG_ARM),y)
+libsha512-y += arm/sha512-core.o
+$(obj)/arm/sha512-core.S: $(src)/arm/sha512-armv4.pl
+ $(call cmd,perlasm)
+AFLAGS_arm/sha512-core.o += $(aflags-thumb2-y)
+endif
+
+ifeq ($(CONFIG_ARM64),y)
+libsha512-y += arm64/sha512-core.o
+$(obj)/arm64/sha512-core.S: $(src)/arm64/sha2-armv8.pl
+ $(call cmd,perlasm_with_args)
+libsha512-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha512-ce-core.o
+endif
+
+libsha512-$(CONFIG_RISCV) += riscv/sha512-riscv64-zvknhb-zvkb.o
+libsha512-$(CONFIG_SPARC) += sparc/sha512_asm.o
+libsha512-$(CONFIG_X86) += x86/sha512-ssse3-asm.o \
+ x86/sha512-avx-asm.o \
+ x86/sha512-avx2-asm.o
+endif # CONFIG_CRYPTO_LIB_SHA512_ARCH
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_SHA3) += libsha3.o
+libsha3-y := sha3.o
+
+ifeq ($(CONFIG_CRYPTO_LIB_SHA3_ARCH),y)
+CFLAGS_sha3.o += -I$(src)/$(SRCARCH)
+libsha3-$(CONFIG_ARM64) += arm64/sha3-ce-core.o
+endif # CONFIG_CRYPTO_LIB_SHA3_ARCH
+
+################################################################################
obj-$(CONFIG_MPILIB) += mpi/
-obj-$(CONFIG_CRYPTO_SELFTESTS) += simd.o
+obj-$(CONFIG_CRYPTO_SELFTESTS_FULL) += simd.o
obj-$(CONFIG_CRYPTO_LIB_SM3) += libsm3.o
libsm3-y := sm3.o
+
+# clean-files must be defined unconditionally
+clean-files += arm/sha256-core.S arm/sha512-core.S
+clean-files += arm64/sha256-core.S arm64/sha512-core.S
diff --git a/lib/crypto/aes.c b/lib/crypto/aes.c
index eafe14d021f5..b57fda3460f1 100644
--- a/lib/crypto/aes.c
+++ b/lib/crypto/aes.c
@@ -5,6 +5,7 @@
#include <crypto/aes.h>
#include <linux/crypto.h>
+#include <linux/export.h>
#include <linux/module.h>
#include <linux/unaligned.h>
diff --git a/lib/crypto/aescfb.c b/lib/crypto/aescfb.c
index 437613265e14..0f294c8cbf3c 100644
--- a/lib/crypto/aescfb.c
+++ b/lib/crypto/aescfb.c
@@ -5,11 +5,10 @@
* Copyright 2023 Google LLC
*/
-#include <linux/module.h>
-
-#include <crypto/algapi.h>
#include <crypto/aes.h>
-
+#include <crypto/algapi.h>
+#include <linux/export.h>
+#include <linux/module.h>
#include <asm/irqflags.h>
static void aescfb_encrypt_block(const struct crypto_aes_ctx *ctx, void *dst,
@@ -106,11 +105,11 @@ MODULE_LICENSE("GPL");
*/
static struct {
- u8 ptext[64];
- u8 ctext[64];
+ u8 ptext[64] __nonstring;
+ u8 ctext[64] __nonstring;
- u8 key[AES_MAX_KEY_SIZE];
- u8 iv[AES_BLOCK_SIZE];
+ u8 key[AES_MAX_KEY_SIZE] __nonstring;
+ u8 iv[AES_BLOCK_SIZE] __nonstring;
int klen;
int len;
diff --git a/lib/crypto/aesgcm.c b/lib/crypto/aesgcm.c
index 277824d6b4af..ac0b2fcfd606 100644
--- a/lib/crypto/aesgcm.c
+++ b/lib/crypto/aesgcm.c
@@ -5,12 +5,11 @@
* Copyright 2022 Google LLC
*/
-#include <linux/module.h>
-
#include <crypto/algapi.h>
#include <crypto/gcm.h>
#include <crypto/ghash.h>
-
+#include <linux/export.h>
+#include <linux/module.h>
#include <asm/irqflags.h>
static void aesgcm_encrypt_block(const struct crypto_aes_ctx *ctx, void *dst,
@@ -205,19 +204,19 @@ MODULE_LICENSE("GPL");
* Test code below. Vectors taken from crypto/testmgr.h
*/
-static const u8 __initconst ctext0[16] =
+static const u8 __initconst ctext0[16] __nonstring =
"\x58\xe2\xfc\xce\xfa\x7e\x30\x61"
"\x36\x7f\x1d\x57\xa4\xe7\x45\x5a";
static const u8 __initconst ptext1[16];
-static const u8 __initconst ctext1[32] =
+static const u8 __initconst ctext1[32] __nonstring =
"\x03\x88\xda\xce\x60\xb6\xa3\x92"
"\xf3\x28\xc2\xb9\x71\xb2\xfe\x78"
"\xab\x6e\x47\xd4\x2c\xec\x13\xbd"
"\xf5\x3a\x67\xb2\x12\x57\xbd\xdf";
-static const u8 __initconst ptext2[64] =
+static const u8 __initconst ptext2[64] __nonstring =
"\xd9\x31\x32\x25\xf8\x84\x06\xe5"
"\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
"\x86\xa7\xa9\x53\x15\x34\xf7\xda"
@@ -227,7 +226,7 @@ static const u8 __initconst ptext2[64] =
"\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57"
"\xba\x63\x7b\x39\x1a\xaf\xd2\x55";
-static const u8 __initconst ctext2[80] =
+static const u8 __initconst ctext2[80] __nonstring =
"\x42\x83\x1e\xc2\x21\x77\x74\x24"
"\x4b\x72\x21\xb7\x84\xd0\xd4\x9c"
"\xe3\xaa\x21\x2f\x2c\x02\xa4\xe0"
@@ -239,7 +238,7 @@ static const u8 __initconst ctext2[80] =
"\x4d\x5c\x2a\xf3\x27\xcd\x64\xa6"
"\x2c\xf3\x5a\xbd\x2b\xa6\xfa\xb4";
-static const u8 __initconst ptext3[60] =
+static const u8 __initconst ptext3[60] __nonstring =
"\xd9\x31\x32\x25\xf8\x84\x06\xe5"
"\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
"\x86\xa7\xa9\x53\x15\x34\xf7\xda"
@@ -249,7 +248,7 @@ static const u8 __initconst ptext3[60] =
"\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57"
"\xba\x63\x7b\x39";
-static const u8 __initconst ctext3[76] =
+static const u8 __initconst ctext3[76] __nonstring =
"\x42\x83\x1e\xc2\x21\x77\x74\x24"
"\x4b\x72\x21\xb7\x84\xd0\xd4\x9c"
"\xe3\xaa\x21\x2f\x2c\x02\xa4\xe0"
@@ -261,17 +260,17 @@ static const u8 __initconst ctext3[76] =
"\x5b\xc9\x4f\xbc\x32\x21\xa5\xdb"
"\x94\xfa\xe9\x5a\xe7\x12\x1a\x47";
-static const u8 __initconst ctext4[16] =
+static const u8 __initconst ctext4[16] __nonstring =
"\xcd\x33\xb2\x8a\xc7\x73\xf7\x4b"
"\xa0\x0e\xd1\xf3\x12\x57\x24\x35";
-static const u8 __initconst ctext5[32] =
+static const u8 __initconst ctext5[32] __nonstring =
"\x98\xe7\x24\x7c\x07\xf0\xfe\x41"
"\x1c\x26\x7e\x43\x84\xb0\xf6\x00"
"\x2f\xf5\x8d\x80\x03\x39\x27\xab"
"\x8e\xf4\xd4\x58\x75\x14\xf0\xfb";
-static const u8 __initconst ptext6[64] =
+static const u8 __initconst ptext6[64] __nonstring =
"\xd9\x31\x32\x25\xf8\x84\x06\xe5"
"\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
"\x86\xa7\xa9\x53\x15\x34\xf7\xda"
@@ -281,7 +280,7 @@ static const u8 __initconst ptext6[64] =
"\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57"
"\xba\x63\x7b\x39\x1a\xaf\xd2\x55";
-static const u8 __initconst ctext6[80] =
+static const u8 __initconst ctext6[80] __nonstring =
"\x39\x80\xca\x0b\x3c\x00\xe8\x41"
"\xeb\x06\xfa\xc4\x87\x2a\x27\x57"
"\x85\x9e\x1c\xea\xa6\xef\xd9\x84"
@@ -293,17 +292,17 @@ static const u8 __initconst ctext6[80] =
"\x99\x24\xa7\xc8\x58\x73\x36\xbf"
"\xb1\x18\x02\x4d\xb8\x67\x4a\x14";
-static const u8 __initconst ctext7[16] =
+static const u8 __initconst ctext7[16] __nonstring =
"\x53\x0f\x8a\xfb\xc7\x45\x36\xb9"
"\xa9\x63\xb4\xf1\xc4\xcb\x73\x8b";
-static const u8 __initconst ctext8[32] =
+static const u8 __initconst ctext8[32] __nonstring =
"\xce\xa7\x40\x3d\x4d\x60\x6b\x6e"
"\x07\x4e\xc5\xd3\xba\xf3\x9d\x18"
"\xd0\xd1\xc8\xa7\x99\x99\x6b\xf0"
"\x26\x5b\x98\xb5\xd4\x8a\xb9\x19";
-static const u8 __initconst ptext9[64] =
+static const u8 __initconst ptext9[64] __nonstring =
"\xd9\x31\x32\x25\xf8\x84\x06\xe5"
"\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
"\x86\xa7\xa9\x53\x15\x34\xf7\xda"
@@ -313,7 +312,7 @@ static const u8 __initconst ptext9[64] =
"\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57"
"\xba\x63\x7b\x39\x1a\xaf\xd2\x55";
-static const u8 __initconst ctext9[80] =
+static const u8 __initconst ctext9[80] __nonstring =
"\x52\x2d\xc1\xf0\x99\x56\x7d\x07"
"\xf4\x7f\x37\xa3\x2a\x84\x42\x7d"
"\x64\x3a\x8c\xdc\xbf\xe5\xc0\xc9"
@@ -325,7 +324,7 @@ static const u8 __initconst ctext9[80] =
"\xb0\x94\xda\xc5\xd9\x34\x71\xbd"
"\xec\x1a\x50\x22\x70\xe3\xcc\x6c";
-static const u8 __initconst ptext10[60] =
+static const u8 __initconst ptext10[60] __nonstring =
"\xd9\x31\x32\x25\xf8\x84\x06\xe5"
"\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
"\x86\xa7\xa9\x53\x15\x34\xf7\xda"
@@ -335,7 +334,7 @@ static const u8 __initconst ptext10[60] =
"\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57"
"\xba\x63\x7b\x39";
-static const u8 __initconst ctext10[76] =
+static const u8 __initconst ctext10[76] __nonstring =
"\x52\x2d\xc1\xf0\x99\x56\x7d\x07"
"\xf4\x7f\x37\xa3\x2a\x84\x42\x7d"
"\x64\x3a\x8c\xdc\xbf\xe5\xc0\xc9"
@@ -347,7 +346,7 @@ static const u8 __initconst ctext10[76] =
"\x76\xfc\x6e\xce\x0f\x4e\x17\x68"
"\xcd\xdf\x88\x53\xbb\x2d\x55\x1b";
-static const u8 __initconst ptext11[60] =
+static const u8 __initconst ptext11[60] __nonstring =
"\xd9\x31\x32\x25\xf8\x84\x06\xe5"
"\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
"\x86\xa7\xa9\x53\x15\x34\xf7\xda"
@@ -357,7 +356,7 @@ static const u8 __initconst ptext11[60] =
"\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57"
"\xba\x63\x7b\x39";
-static const u8 __initconst ctext11[76] =
+static const u8 __initconst ctext11[76] __nonstring =
"\x39\x80\xca\x0b\x3c\x00\xe8\x41"
"\xeb\x06\xfa\xc4\x87\x2a\x27\x57"
"\x85\x9e\x1c\xea\xa6\xef\xd9\x84"
@@ -369,7 +368,7 @@ static const u8 __initconst ctext11[76] =
"\x25\x19\x49\x8e\x80\xf1\x47\x8f"
"\x37\xba\x55\xbd\x6d\x27\x61\x8c";
-static const u8 __initconst ptext12[719] =
+static const u8 __initconst ptext12[719] __nonstring =
"\x42\xc1\xcc\x08\x48\x6f\x41\x3f"
"\x2f\x11\x66\x8b\x2a\x16\xf0\xe0"
"\x58\x83\xf0\xc3\x70\x14\xc0\x5b"
@@ -461,7 +460,7 @@ static const u8 __initconst ptext12[719] =
"\x59\xfa\xfa\xaa\x44\x04\x01\xa7"
"\xa4\x78\xdb\x74\x3d\x8b\xb5";
-static const u8 __initconst ctext12[735] =
+static const u8 __initconst ctext12[735] __nonstring =
"\x84\x0b\xdb\xd5\xb7\xa8\xfe\x20"
"\xbb\xb1\x12\x7f\x41\xea\xb3\xc0"
"\xa2\xb4\x37\x19\x11\x58\xb6\x0b"
@@ -559,9 +558,9 @@ static struct {
const u8 *ptext;
const u8 *ctext;
- u8 key[AES_MAX_KEY_SIZE];
- u8 iv[GCM_AES_IV_SIZE];
- u8 assoc[20];
+ u8 key[AES_MAX_KEY_SIZE] __nonstring;
+ u8 iv[GCM_AES_IV_SIZE] __nonstring;
+ u8 assoc[20] __nonstring;
int klen;
int clen;
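
About the __nonstring annotations added throughout these vectors: the arrays
are raw byte blobs initialized from string literals that fill the array
exactly, so the literal's implicit NUL terminator is dropped. That is valid
C, but GCC 15 warns about it (-Wunterminated-string-initialization) unless
the object is marked as not being a C string, which is what __nonstring
does. In miniature (hypothetical array, real attribute):

/*
 * A 4-byte blob: the literal is 5 bytes counting '\0', so the terminator is
 * dropped.  The attribute (which the kernel wraps as __nonstring) declares
 * the truncation intentional and silences the GCC 15 warning.
 */
static const unsigned char iv[4] __attribute__((__nonstring__)) =
	"\x01\x02\x03\x04";
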
diff --git a/lib/crypto/arc4.c b/lib/crypto/arc4.c
index 838812d18216..4e950e1e66d0 100644
--- a/lib/crypto/arc4.c
+++ b/lib/crypto/arc4.c
@@ -8,6 +8,7 @@
*/
#include <crypto/arc4.h>
+#include <linux/export.h>
#include <linux/module.h>
int arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len)
diff --git a/lib/crypto/arm/.gitignore b/lib/crypto/arm/.gitignore
new file mode 100644
index 000000000000..f6c4e8ef80da
--- /dev/null
+++ b/lib/crypto/arm/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+poly1305-core.S
+sha256-core.S
+sha512-core.S
diff --git a/lib/crypto/arm/blake2b-neon-core.S b/lib/crypto/arm/blake2b-neon-core.S
new file mode 100644
index 000000000000..b55c37f0b88f
--- /dev/null
+++ b/lib/crypto/arm/blake2b-neon-core.S
@@ -0,0 +1,350 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * BLAKE2b digest algorithm optimized with ARM NEON instructions. On ARM
+ * processors that have NEON support but not the ARMv8 Crypto Extensions,
+ * typically this BLAKE2b implementation is much faster than the SHA-2 family
+ * and slightly faster than SHA-1.
+ *
+ * Copyright 2020 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+ .text
+ .fpu neon
+
+ // The arguments to blake2b_compress_neon()
+ CTX .req r0
+ DATA .req r1
+ NBLOCKS .req r2
+ INC .req r3
+
+ // Pointers to the rotation tables
+ ROR24_TABLE .req r4
+ ROR16_TABLE .req r5
+
+ // The original stack pointer
+ ORIG_SP .req r6
+
+ // NEON registers which contain the message words of the current block.
+ // M_0-M_3 are occasionally used for other purposes too.
+ M_0 .req d16
+ M_1 .req d17
+ M_2 .req d18
+ M_3 .req d19
+ M_4 .req d20
+ M_5 .req d21
+ M_6 .req d22
+ M_7 .req d23
+ M_8 .req d24
+ M_9 .req d25
+ M_10 .req d26
+ M_11 .req d27
+ M_12 .req d28
+ M_13 .req d29
+ M_14 .req d30
+ M_15 .req d31
+
+ .align 4
+ // Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
+ // instruction. This is the most efficient way to implement these
+ // rotation amounts with NEON. (On Cortex-A53 it's the same speed as
+ // vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
+.Lror24_table:
+ .byte 3, 4, 5, 6, 7, 0, 1, 2
+.Lror16_table:
+ .byte 2, 3, 4, 5, 6, 7, 0, 1
+ // The BLAKE2b initialization vector
+.Lblake2b_IV:
+ .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
+ .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
+ .quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f
+ .quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
+
+// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
+// NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). The stack
+// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
+// (M_0-M_3), so that they can be reloaded if they are used as temporary
+// registers. The macro arguments s0-s15 give the order in which the message
+// words are used in this round. 'final' is 1 if this is the final round.
+.macro _blake2b_round s0, s1, s2, s3, s4, s5, s6, s7, \
+ s8, s9, s10, s11, s12, s13, s14, s15, final=0
+
+ // Mix the columns:
+ // (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
+ // (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).
+
+ // a += b + m[blake2b_sigma[r][2*i + 0]];
+ vadd.u64 q0, q0, q2
+ vadd.u64 q1, q1, q3
+ vadd.u64 d0, d0, M_\s0
+ vadd.u64 d1, d1, M_\s2
+ vadd.u64 d2, d2, M_\s4
+ vadd.u64 d3, d3, M_\s6
+
+ // d = ror64(d ^ a, 32);
+ veor q6, q6, q0
+ veor q7, q7, q1
+ vrev64.32 q6, q6
+ vrev64.32 q7, q7
+
+ // c += d;
+ vadd.u64 q4, q4, q6
+ vadd.u64 q5, q5, q7
+
+ // b = ror64(b ^ c, 24);
+ vld1.8 {M_0}, [ROR24_TABLE, :64]
+ veor q2, q2, q4
+ veor q3, q3, q5
+ vtbl.8 d4, {d4}, M_0
+ vtbl.8 d5, {d5}, M_0
+ vtbl.8 d6, {d6}, M_0
+ vtbl.8 d7, {d7}, M_0
+
+ // a += b + m[blake2b_sigma[r][2*i + 1]];
+ //
+ // M_0 got clobbered above, so we have to reload it if any of the four
+ // message words this step needs happens to be M_0. Otherwise we don't
+ // need to reload it here, as it will just get clobbered again below.
+.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
+ vld1.8 {M_0}, [sp, :64]
+.endif
+ vadd.u64 q0, q0, q2
+ vadd.u64 q1, q1, q3
+ vadd.u64 d0, d0, M_\s1
+ vadd.u64 d1, d1, M_\s3
+ vadd.u64 d2, d2, M_\s5
+ vadd.u64 d3, d3, M_\s7
+
+ // d = ror64(d ^ a, 16);
+ vld1.8 {M_0}, [ROR16_TABLE, :64]
+ veor q6, q6, q0
+ veor q7, q7, q1
+ vtbl.8 d12, {d12}, M_0
+ vtbl.8 d13, {d13}, M_0
+ vtbl.8 d14, {d14}, M_0
+ vtbl.8 d15, {d15}, M_0
+
+ // c += d;
+ vadd.u64 q4, q4, q6
+ vadd.u64 q5, q5, q7
+
+ // b = ror64(b ^ c, 63);
+ //
+ // This rotation amount isn't a multiple of 8, so it has to be
+ // implemented using a pair of shifts, which requires temporary
+ // registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
+ veor q8, q2, q4
+ veor q9, q3, q5
+ vshr.u64 q2, q8, #63
+ vshr.u64 q3, q9, #63
+ vsli.u64 q2, q8, #1
+ vsli.u64 q3, q9, #1
+ vld1.8 {q8-q9}, [sp, :256]
+
+ // Mix the diagonals:
+ // (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
+ // (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
+ //
+ // There are two possible ways to do this: use 'vext' instructions to
+ // shift the rows of the matrix so that the diagonals become columns,
+ // and undo it afterwards; or just use 64-bit operations on 'd'
+ // registers instead of 128-bit operations on 'q' registers. We use the
+ // latter approach, as it performs much better on Cortex-A7.
+
+ // a += b + m[blake2b_sigma[r][2*i + 0]];
+ vadd.u64 d0, d0, d5
+ vadd.u64 d1, d1, d6
+ vadd.u64 d2, d2, d7
+ vadd.u64 d3, d3, d4
+ vadd.u64 d0, d0, M_\s8
+ vadd.u64 d1, d1, M_\s10
+ vadd.u64 d2, d2, M_\s12
+ vadd.u64 d3, d3, M_\s14
+
+ // d = ror64(d ^ a, 32);
+ veor d15, d15, d0
+ veor d12, d12, d1
+ veor d13, d13, d2
+ veor d14, d14, d3
+ vrev64.32 d15, d15
+ vrev64.32 d12, d12
+ vrev64.32 d13, d13
+ vrev64.32 d14, d14
+
+ // c += d;
+ vadd.u64 d10, d10, d15
+ vadd.u64 d11, d11, d12
+ vadd.u64 d8, d8, d13
+ vadd.u64 d9, d9, d14
+
+ // b = ror64(b ^ c, 24);
+ vld1.8 {M_0}, [ROR24_TABLE, :64]
+ veor d5, d5, d10
+ veor d6, d6, d11
+ veor d7, d7, d8
+ veor d4, d4, d9
+ vtbl.8 d5, {d5}, M_0
+ vtbl.8 d6, {d6}, M_0
+ vtbl.8 d7, {d7}, M_0
+ vtbl.8 d4, {d4}, M_0
+
+ // a += b + m[blake2b_sigma[r][2*i + 1]];
+.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
+ vld1.8 {M_0}, [sp, :64]
+.endif
+ vadd.u64 d0, d0, d5
+ vadd.u64 d1, d1, d6
+ vadd.u64 d2, d2, d7
+ vadd.u64 d3, d3, d4
+ vadd.u64 d0, d0, M_\s9
+ vadd.u64 d1, d1, M_\s11
+ vadd.u64 d2, d2, M_\s13
+ vadd.u64 d3, d3, M_\s15
+
+ // d = ror64(d ^ a, 16);
+ vld1.8 {M_0}, [ROR16_TABLE, :64]
+ veor d15, d15, d0
+ veor d12, d12, d1
+ veor d13, d13, d2
+ veor d14, d14, d3
+ vtbl.8 d12, {d12}, M_0
+ vtbl.8 d13, {d13}, M_0
+ vtbl.8 d14, {d14}, M_0
+ vtbl.8 d15, {d15}, M_0
+
+ // c += d;
+ vadd.u64 d10, d10, d15
+ vadd.u64 d11, d11, d12
+ vadd.u64 d8, d8, d13
+ vadd.u64 d9, d9, d14
+
+ // b = ror64(b ^ c, 63);
+ veor d16, d4, d9
+ veor d17, d5, d10
+ veor d18, d6, d11
+ veor d19, d7, d8
+ vshr.u64 q2, q8, #63
+ vshr.u64 q3, q9, #63
+ vsli.u64 q2, q8, #1
+ vsli.u64 q3, q9, #1
+ // Reloading q8-q9 can be skipped on the final round.
+.if ! \final
+ vld1.8 {q8-q9}, [sp, :256]
+.endif
+.endm
+
+//
+// void blake2b_compress_neon(struct blake2b_ctx *ctx,
+// const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2b_ctx are used:
+// u64 h[8]; (inout)
+// u64 t[2]; (inout)
+// u64 f[2]; (in)
+//
+ .align 5
+ENTRY(blake2b_compress_neon)
+ push {r4-r10}
+
+ // Allocate a 32-byte stack buffer that is 32-byte aligned.
+ mov ORIG_SP, sp
+ sub ip, sp, #32
+ bic ip, ip, #31
+ mov sp, ip
+
+ adr ROR24_TABLE, .Lror24_table
+ adr ROR16_TABLE, .Lror16_table
+
+ mov ip, CTX
+ vld1.64 {q0-q1}, [ip]! // Load h[0..3]
+ vld1.64 {q2-q3}, [ip]! // Load h[4..7]
+.Lnext_block:
+ adr r10, .Lblake2b_IV
+ vld1.64 {q14-q15}, [ip] // Load t[0..1] and f[0..1]
+ vld1.64 {q4-q5}, [r10]! // Load IV[0..3]
+ vmov r7, r8, d28 // Copy t[0] to (r7, r8)
+ vld1.64 {q6-q7}, [r10] // Load IV[4..7]
+ adds r7, r7, INC // Increment counter
+ bcs .Lslow_inc_ctr
+ vmov.i32 d28[0], r7
+ vst1.64 {d28}, [ip] // Update t[0]
+.Linc_ctr_done:
+
+ // Load the next message block and finish initializing the state matrix
+ // 'v'. Fortunately, there are exactly enough NEON registers to fit the
+	// entire state matrix in q0-q7 and the entire message block in q8-q15.
+ //
+ // However, _blake2b_round also needs some extra registers for rotates,
+ // so we have to spill some registers. It's better to spill the message
+ // registers than the state registers, as the message doesn't change.
+ // Therefore we store a copy of the first 32 bytes of the message block
+ // (q8-q9) in an aligned buffer on the stack so that they can be
+ // reloaded when needed. (We could just reload directly from the
+ // message buffer, but it's faster to use aligned loads.)
+ vld1.8 {q8-q9}, [DATA]!
+ veor q6, q6, q14 // v[12..13] = IV[4..5] ^ t[0..1]
+ vld1.8 {q10-q11}, [DATA]!
+ veor q7, q7, q15 // v[14..15] = IV[6..7] ^ f[0..1]
+ vld1.8 {q12-q13}, [DATA]!
+ vst1.8 {q8-q9}, [sp, :256]
+ mov ip, CTX
+ vld1.8 {q14-q15}, [DATA]!
+
+ // Execute the rounds. Each round is provided the order in which it
+ // needs to use the message words.
+ _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
+ _blake2b_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
+ _blake2b_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
+ _blake2b_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
+ _blake2b_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
+ _blake2b_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
+ _blake2b_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
+ _blake2b_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
+ _blake2b_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
+ _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
+ final=1
+
+ // Fold the final state matrix into the hash chaining value:
+ //
+ // for (i = 0; i < 8; i++)
+ // h[i] ^= v[i] ^ v[i + 8];
+ //
+ vld1.64 {q8-q9}, [ip]! // Load old h[0..3]
+ veor q0, q0, q4 // v[0..1] ^= v[8..9]
+ veor q1, q1, q5 // v[2..3] ^= v[10..11]
+ vld1.64 {q10-q11}, [ip] // Load old h[4..7]
+ veor q2, q2, q6 // v[4..5] ^= v[12..13]
+ veor q3, q3, q7 // v[6..7] ^= v[14..15]
+ veor q0, q0, q8 // v[0..1] ^= h[0..1]
+ veor q1, q1, q9 // v[2..3] ^= h[2..3]
+ mov ip, CTX
+ subs NBLOCKS, NBLOCKS, #1 // nblocks--
+ vst1.64 {q0-q1}, [ip]! // Store new h[0..3]
+ veor q2, q2, q10 // v[4..5] ^= h[4..5]
+ veor q3, q3, q11 // v[6..7] ^= h[6..7]
+ vst1.64 {q2-q3}, [ip]! // Store new h[4..7]
+
+ // Advance to the next block, if there is one.
+ bne .Lnext_block // nblocks != 0?
+
+ mov sp, ORIG_SP
+ pop {r4-r10}
+ mov pc, lr
+
+.Lslow_inc_ctr:
+ // Handle the case where the counter overflowed its low 32 bits, by
+ // carrying the overflow bit into the full 128-bit counter.
+ vmov r9, r10, d29
+ adcs r8, r8, #0
+ adcs r9, r9, #0
+ adc r10, r10, #0
+ vmov d28, r7, r8
+ vmov d29, r9, r10
+ vst1.64 {q14}, [ip] // Update t[0] and t[1]
+ b .Linc_ctr_done
+ENDPROC(blake2b_compress_neon)
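
The rotation tables at the top of this file reward a closer look: on a
little-endian core, rotating a 64-bit lane right by a multiple of 8 is a pure
byte permutation, so vtbl.8 with the right index table does it in a single
instruction (hence the comment comparing it with the vshr.u64 + vsli.u64
pair). A host-side check of the two tables in plain C (little-endian host
assumed):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t ror64(uint64_t x, unsigned int n)
{
	return (x >> n) | (x << (64 - n));
}

/* dest byte i = src byte table[i], which is what vtbl.8 computes */
static uint64_t shuffle(uint64_t x, const uint8_t table[8])
{
	uint8_t src[8], dst[8];
	uint64_t y;

	memcpy(src, &x, 8);
	for (int i = 0; i < 8; i++)
		dst[i] = src[table[i]];
	memcpy(&y, dst, 8);
	return y;
}

int main(void)
{
	static const uint8_t ror24_table[8] = { 3, 4, 5, 6, 7, 0, 1, 2 };
	static const uint8_t ror16_table[8] = { 2, 3, 4, 5, 6, 7, 0, 1 };
	uint64_t x = 0x0123456789abcdefULL;

	assert(shuffle(x, ror24_table) == ror64(x, 24));
	assert(shuffle(x, ror16_table) == ror64(x, 16));
	printf("byte-shuffle rotations match ror64(x, 24) and ror64(x, 16)\n");
	return 0;
}
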
diff --git a/lib/crypto/arm/blake2b.h b/lib/crypto/arm/blake2b.h
new file mode 100644
index 000000000000..5c76498521e6
--- /dev/null
+++ b/lib/crypto/arm/blake2b.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * BLAKE2b digest algorithm, NEON accelerated
+ *
+ * Copyright 2020 Google LLC
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+asmlinkage void blake2b_compress_neon(struct blake2b_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc);
+
+static void blake2b_compress(struct blake2b_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc)
+{
+ if (!static_branch_likely(&have_neon) || !may_use_simd()) {
+ blake2b_compress_generic(ctx, data, nblocks, inc);
+ return;
+ }
+ do {
+ const size_t blocks = min_t(size_t, nblocks,
+ SZ_4K / BLAKE2B_BLOCK_SIZE);
+
+ scoped_ksimd()
+ blake2b_compress_neon(ctx, data, blocks, inc);
+
+ data += blocks * BLAKE2B_BLOCK_SIZE;
+ nblocks -= blocks;
+ } while (nblocks);
+}
+
+#define blake2b_mod_init_arch blake2b_mod_init_arch
+static void blake2b_mod_init_arch(void)
+{
+ if (elf_hwcap & HWCAP_NEON)
+ static_branch_enable(&have_neon);
+}
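
One glue detail worth spelling out: this wrapper never hands more than SZ_4K
bytes, i.e. 4096 / 128 = 32 BLAKE2b blocks, to a single scoped_ksimd()
section, looping for longer inputs. That bounds how long the CPU stays in the
kernel-SIMD region (where preemption is typically disabled on arm) at a
negligible amortized cost. A trimmed-down model of the slicing loop in plain
C:

#include <stddef.h>
#include <stdio.h>

#define BLAKE2B_BLOCK_SIZE 128
#define SZ_4K 4096

int main(void)
{
	size_t nblocks = 100;	/* 12800 bytes of input */

	while (nblocks) {
		size_t blocks = nblocks < SZ_4K / BLAKE2B_BLOCK_SIZE ?
				nblocks : SZ_4K / BLAKE2B_BLOCK_SIZE;

		printf("one NEON section: %zu blocks\n", blocks);
		nblocks -= blocks;	/* prints 32, 32, 32, 4 */
	}
	return 0;
}
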
diff --git a/lib/crypto/arm/blake2s-core.S b/lib/crypto/arm/blake2s-core.S
new file mode 100644
index 000000000000..933f0558b7cd
--- /dev/null
+++ b/lib/crypto/arm/blake2s-core.S
@@ -0,0 +1,309 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * BLAKE2s digest algorithm, ARM scalar implementation. This is faster
+ * than the generic implementations of BLAKE2s and BLAKE2b, but slower
+ * than the NEON implementation of BLAKE2b. There is no NEON
+ * implementation of BLAKE2s, since NEON doesn't really help with it.
+ *
+ * Copyright 2020 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ // Registers used to hold message words temporarily. There aren't
+ // enough ARM registers to hold the whole message block, so we have to
+ // load the words on-demand.
+ M_0 .req r12
+ M_1 .req r14
+
+// The BLAKE2s initialization vector
+.Lblake2s_IV:
+ .word 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+ .word 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+
+.macro __ldrd a, b, src, offset
+#if __LINUX_ARM_ARCH__ >= 6
+ ldrd \a, \b, [\src, #\offset]
+#else
+ ldr \a, [\src, #\offset]
+ ldr \b, [\src, #\offset + 4]
+#endif
+.endm
+
+.macro __strd a, b, dst, offset
+#if __LINUX_ARM_ARCH__ >= 6
+ strd \a, \b, [\dst, #\offset]
+#else
+ str \a, [\dst, #\offset]
+ str \b, [\dst, #\offset + 4]
+#endif
+.endm
+
+.macro _le32_bswap a, tmp
+#ifdef __ARMEB__
+ rev_l \a, \tmp
+#endif
+.endm
+
+.macro _le32_bswap_8x a, b, c, d, e, f, g, h, tmp
+ _le32_bswap \a, \tmp
+ _le32_bswap \b, \tmp
+ _le32_bswap \c, \tmp
+ _le32_bswap \d, \tmp
+ _le32_bswap \e, \tmp
+ _le32_bswap \f, \tmp
+ _le32_bswap \g, \tmp
+ _le32_bswap \h, \tmp
+.endm
+
+// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
+// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
+// columns/diagonals. s0-s1 are the word offsets to the message words the first
+// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
+// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
+//
+// Note that to save instructions, the rotations don't happen when the
+// pseudocode says they should, but rather they are delayed until the values are
+// used. See the comment above _blake2s_round().
+.macro _blake2s_quarterround a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3
+
+ ldr M_0, [sp, #32 + 4 * \s0]
+ ldr M_1, [sp, #32 + 4 * \s2]
+
+ // a += b + m[blake2s_sigma[r][2*i + 0]];
+ add \a0, \a0, \b0, ror #brot
+ add \a1, \a1, \b1, ror #brot
+ add \a0, \a0, M_0
+ add \a1, \a1, M_1
+
+ // d = ror32(d ^ a, 16);
+ eor \d0, \a0, \d0, ror #drot
+ eor \d1, \a1, \d1, ror #drot
+
+ // c += d;
+ add \c0, \c0, \d0, ror #16
+ add \c1, \c1, \d1, ror #16
+
+ // b = ror32(b ^ c, 12);
+ eor \b0, \c0, \b0, ror #brot
+ eor \b1, \c1, \b1, ror #brot
+
+ ldr M_0, [sp, #32 + 4 * \s1]
+ ldr M_1, [sp, #32 + 4 * \s3]
+
+ // a += b + m[blake2s_sigma[r][2*i + 1]];
+ add \a0, \a0, \b0, ror #12
+ add \a1, \a1, \b1, ror #12
+ add \a0, \a0, M_0
+ add \a1, \a1, M_1
+
+ // d = ror32(d ^ a, 8);
+ eor \d0, \a0, \d0, ror#16
+ eor \d1, \a1, \d1, ror#16
+
+ // c += d;
+ add \c0, \c0, \d0, ror#8
+ add \c1, \c1, \d1, ror#8
+
+ // b = ror32(b ^ c, 7);
+ eor \b0, \c0, \b0, ror#12
+ eor \b1, \c1, \b1, ror#12
+.endm
+
+// Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9]
+// are in r0..r9. The stack pointer points to 8 bytes of scratch space for
+// spilling v[8..9], then to v[10..15], then to the message block. r10-r12 and
+// r14 are free to use. The macro arguments s0-s15 give the order in which the
+// message words are used in this round.
+//
+// All rotates are performed using the implicit rotate operand accepted by the
+// 'add' and 'eor' instructions. This is faster than using explicit rotate
+// instructions. To make this work, we allow the values in the second and last
+// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
+// wrong rotation amount. The rotation amount is then fixed up just in time
+// when the values are used. 'brot' is the number of bits the values in row 'b'
+// need to be rotated right to arrive at the correct values, and 'drot'
+// similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
+// that they end up as (7, 8) after every round.
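+//
+// For example, while brot == 7 a register \b0 actually holds the row-'b'
+// value rotated left by 7 bits; its true value is ror32(\b0, brot). The
+// fixup is folded into the next use at no extra cost, so the pseudocode
+// statement "a += b" becomes (a sketch of the idiom used in
+// _blake2s_quarterround above):
+//
+//	add	\a0, \a0, \b0, ror #brot	// a += ror32(\b0, brot)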
+.macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \
+ s8, s9, s10, s11, s12, s13, s14, s15
+
+ // Mix first two columns:
+ // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
+ __ldrd r10, r11, sp, 16 // load v[12] and v[13]
+ _blake2s_quarterround r0, r4, r8, r10, r1, r5, r9, r11, \
+ \s0, \s1, \s2, \s3
+ __strd r8, r9, sp, 0
+ __strd r10, r11, sp, 16
+
+ // Mix second two columns:
+ // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
+ __ldrd r8, r9, sp, 8 // load v[10] and v[11]
+ __ldrd r10, r11, sp, 24 // load v[14] and v[15]
+ _blake2s_quarterround r2, r6, r8, r10, r3, r7, r9, r11, \
+ \s4, \s5, \s6, \s7
+ str r10, [sp, #24] // store v[14]
+ // v[10], v[11], and v[15] are used below, so no need to store them yet.
+
+ .set brot, 7
+ .set drot, 8
+
+ // Mix first two diagonals:
+ // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
+ ldr r10, [sp, #16] // load v[12]
+ _blake2s_quarterround r0, r5, r8, r11, r1, r6, r9, r10, \
+ \s8, \s9, \s10, \s11
+ __strd r8, r9, sp, 8
+ str r11, [sp, #28]
+ str r10, [sp, #16]
+
+ // Mix second two diagonals:
+ // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
+ __ldrd r8, r9, sp, 0 // load v[8] and v[9]
+ __ldrd r10, r11, sp, 20 // load v[13] and v[14]
+ _blake2s_quarterround r2, r7, r8, r10, r3, r4, r9, r11, \
+ \s12, \s13, \s14, \s15
+ __strd r10, r11, sp, 20
+.endm
+
+//
+// void blake2s_compress(struct blake2s_ctx *ctx,
+// const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_ctx are used:
+// u32 h[8]; (inout)
+// u32 t[2]; (inout)
+// u32 f[2]; (in)
+//
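+// A typical caller passes inc == BLAKE2S_BLOCK_SIZE (64); only a final,
+// partial block may use a smaller 'inc'. A usage sketch (not taken from
+// this file):
+//
+//	blake2s_compress(ctx, data, nblocks, BLAKE2S_BLOCK_SIZE);
+//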
+ .align 5
+ENTRY(blake2s_compress)
+	push		{r0-r2,r4-r11,lr}	// even register count keeps sp 8-byte aligned
+
+.Lnext_block:
+ // r0 is 'ctx'
+ // r1 is 'data'
+ // r3 is 'inc'
+
+ // Load and increment the counter t[0..1].
+ __ldrd r10, r11, r0, 32
+ adds r10, r10, r3
+ adc r11, r11, #0
+ __strd r10, r11, r0, 32
+
+ // _blake2s_round is very short on registers, so copy the message block
+ // to the stack to save a register during the rounds. This also has the
+ // advantage that misalignment only needs to be dealt with in one place.
+ sub sp, sp, #64
+ mov r12, sp
+ tst r1, #3
+ bne .Lcopy_block_misaligned
+ ldmia r1!, {r2-r9}
+ _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14
+ stmia r12!, {r2-r9}
+ ldmia r1!, {r2-r9}
+ _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14
+ stmia r12, {r2-r9}
+.Lcopy_block_done:
+ str r1, [sp, #68] // Update message pointer
+
+ // Calculate v[8..15]. Push v[10..15] onto the stack, and leave space
+ // for spilling v[8..9]. Leave v[8..9] in r8-r9.
+ mov r14, r0 // r14 = ctx
+ adr r12, .Lblake2s_IV
+ ldmia r12!, {r8-r9} // load IV[0..1]
+ __ldrd r0, r1, r14, 40 // load f[0..1]
+ ldm r12, {r2-r7} // load IV[2..7]
+ eor r4, r4, r10 // v[12] = IV[4] ^ t[0]
+ eor r5, r5, r11 // v[13] = IV[5] ^ t[1]
+ eor r6, r6, r0 // v[14] = IV[6] ^ f[0]
+ eor r7, r7, r1 // v[15] = IV[7] ^ f[1]
+ push {r2-r7} // push v[10..15]
+ sub sp, sp, #8 // leave space for v[8..9]
+
+ // Load h[0..7] == v[0..7].
+ ldm r14, {r0-r7}
+
+ // Execute the rounds. Each round is provided the order in which it
+ // needs to use the message words.
+ .set brot, 0
+ .set drot, 0
+ _blake2s_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ _blake2s_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
+ _blake2s_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
+ _blake2s_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
+ _blake2s_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
+ _blake2s_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
+ _blake2s_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
+ _blake2s_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
+ _blake2s_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
+ _blake2s_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
+
+ // Fold the final state matrix into the hash chaining value:
+ //
+ // for (i = 0; i < 8; i++)
+ // h[i] ^= v[i] ^ v[i + 8];
+ //
+ ldr r14, [sp, #96] // r14 = &h[0]
+ add sp, sp, #8 // v[8..9] are already loaded.
+ pop {r10-r11} // load v[10..11]
+ eor r0, r0, r8
+ eor r1, r1, r9
+ eor r2, r2, r10
+ eor r3, r3, r11
+ ldm r14, {r8-r11} // load h[0..3]
+ eor r0, r0, r8
+ eor r1, r1, r9
+ eor r2, r2, r10
+ eor r3, r3, r11
+ stmia r14!, {r0-r3} // store new h[0..3]
+ ldm r14, {r0-r3} // load old h[4..7]
+ pop {r8-r11} // load v[12..15]
+ eor r0, r0, r4, ror #brot
+ eor r1, r1, r5, ror #brot
+ eor r2, r2, r6, ror #brot
+ eor r3, r3, r7, ror #brot
+ eor r0, r0, r8, ror #drot
+ eor r1, r1, r9, ror #drot
+ eor r2, r2, r10, ror #drot
+ eor r3, r3, r11, ror #drot
+ add sp, sp, #64 // skip copy of message block
+ stm r14, {r0-r3} // store new h[4..7]
+
+ // Advance to the next block, if there is one. Note that if there are
+ // multiple blocks, then 'inc' (the counter increment amount) must be
+ // 64. So we can simply set it to 64 without re-loading it.
+ ldm sp, {r0, r1, r2} // load (ctx, data, nblocks)
+ mov r3, #64 // set 'inc'
+ subs r2, r2, #1 // nblocks--
+ str r2, [sp, #8]
+ bne .Lnext_block // nblocks != 0?
+
+ pop {r0-r2,r4-r11,pc}
+
+ // The next message block (pointed to by r1) isn't 4-byte aligned, so it
+ // can't be loaded using ldmia. Copy it to the stack buffer (pointed to
+ // by r12) using an alternative method. r2-r9 are free to use.
+.Lcopy_block_misaligned:
+ mov r2, #64
+1:
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ ldr r3, [r1], #4
+ _le32_bswap r3, r4
+#else
+ ldrb r3, [r1, #0]
+ ldrb r4, [r1, #1]
+ ldrb r5, [r1, #2]
+ ldrb r6, [r1, #3]
+ add r1, r1, #4
+ orr r3, r3, r4, lsl #8
+ orr r3, r3, r5, lsl #16
+ orr r3, r3, r6, lsl #24
+#endif
+ subs r2, r2, #4
+ str r3, [r12], #4
+ bne 1b
+ b .Lcopy_block_done
+ENDPROC(blake2s_compress)
diff --git a/lib/crypto/arm/blake2s.h b/lib/crypto/arm/blake2s.h
new file mode 100644
index 000000000000..42c04440c191
--- /dev/null
+++ b/lib/crypto/arm/blake2s.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+/* defined in blake2s-core.S */
+void blake2s_compress(struct blake2s_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc);
diff --git a/lib/crypto/arm/chacha-neon-core.S b/lib/crypto/arm/chacha-neon-core.S
new file mode 100644
index 000000000000..ddd62b6294a5
--- /dev/null
+++ b/lib/crypto/arm/chacha-neon-core.S
@@ -0,0 +1,643 @@
+/*
+ * ChaCha/HChaCha NEON helper functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+ /*
+ * NEON doesn't have a rotate instruction. The alternatives are, more or less:
+ *
+ * (a) vshl.u32 + vsri.u32 (needs temporary register)
+ * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register)
+ * (c) vrev32.16 (16-bit rotations only)
+ * (d) vtbl.8 + vtbl.8 (multiple of 8 bits rotations only,
+ * needs index vector)
+ *
+ * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations,
+ * the only choices are (a) and (b). We use (a) since it takes two-thirds the
+ * cycles of (b) on both Cortex-A7 and Cortex-A53.
+ *
+ * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
+ * and doesn't need a temporary register.
+ *
+ * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence
+ * is twice as fast as (a), even when doing (a) on multiple registers
+ * simultaneously to eliminate the stall between vshl and vsri. Also, it
+ * parallelizes better when temporary registers are scarce.
+ *
+ * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
+ * (a), so the need to load the rotation table actually makes the vtbl method
+ * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it
+ * seems to be a good compromise to get a more significant speed boost on some
+ * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
+ */
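+
+/*
+ * For illustration, method (a) expresses rotl32(x, 12) with a shift pair
+ * (a sketch of the pattern used in the rounds below):
+ *
+ *	vshl.u32	q1, q4, #12	// q1 = x << 12
+ *	vsri.u32	q1, q4, #20	// q1 |= x >> (32 - 12)
+ *
+ * while the vtbl.8 method indexes .Lrol8_table so that each output byte is
+ * the input byte one position lower (mod 4) within its 32-bit lane.
+ */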
+
+#include <linux/linkage.h>
+#include <asm/cache.h>
+
+ .text
+ .fpu neon
+ .align 5
+
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers q0-q3. It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * The round count is given in r3.
+ *
+ * Clobbers: r3, ip, q4-q5
+ */
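+//
+// The state is held one 4-word row per q register (a sketch):
+//
+//	q0 = ( x0  x1  x2  x3)	// "expand 32-byte k" constants
+//	q1 = ( x4  x5  x6  x7)	// key
+//	q2 = ( x8  x9 x10 x11)	// key
+//	q3 = (x12 x13 x14 x15)	// counter and nonce
+//
+// so each vadd.i32/veor below operates on a whole row at once, and the
+// vext.8 shuffles rotate rows 1-3 to line the diagonals up as columns for
+// the second half of each double round.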
+chacha_permute:
+
+ adr ip, .Lrol8_table
+ vld1.8 {d10}, [ip, :64]
+
+.Ldoubleround:
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vadd.i32 q0, q0, q1
+ veor q3, q3, q0
+ vrev32.16 q3, q3
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vadd.i32 q2, q2, q3
+ veor q4, q1, q2
+ vshl.u32 q1, q4, #12
+ vsri.u32 q1, q4, #20
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vadd.i32 q0, q0, q1
+ veor q3, q3, q0
+ vtbl.8 d6, {d6}, d10
+ vtbl.8 d7, {d7}, d10
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vadd.i32 q2, q2, q3
+ veor q4, q1, q2
+ vshl.u32 q1, q4, #7
+ vsri.u32 q1, q4, #25
+
+ // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vext.8 q1, q1, q1, #4
+ // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vext.8 q2, q2, q2, #8
+ // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vext.8 q3, q3, q3, #12
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vadd.i32 q0, q0, q1
+ veor q3, q3, q0
+ vrev32.16 q3, q3
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vadd.i32 q2, q2, q3
+ veor q4, q1, q2
+ vshl.u32 q1, q4, #12
+ vsri.u32 q1, q4, #20
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vadd.i32 q0, q0, q1
+ veor q3, q3, q0
+ vtbl.8 d6, {d6}, d10
+ vtbl.8 d7, {d7}, d10
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vadd.i32 q2, q2, q3
+ veor q4, q1, q2
+ vshl.u32 q1, q4, #7
+ vsri.u32 q1, q4, #25
+
+ // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vext.8 q1, q1, q1, #12
+ // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vext.8 q2, q2, q2, #8
+ // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vext.8 q3, q3, q3, #4
+
+ subs r3, r3, #2
+ bne .Ldoubleround
+
+ bx lr
+ENDPROC(chacha_permute)
+
+ENTRY(chacha_block_xor_neon)
+ // r0: Input state matrix, s
+ // r1: 1 data block output, o
+ // r2: 1 data block input, i
+ // r3: nrounds
+ push {lr}
+
+ // x0..3 = s0..3
+ add ip, r0, #0x20
+ vld1.32 {q0-q1}, [r0]
+ vld1.32 {q2-q3}, [ip]
+
+ vmov q8, q0
+ vmov q9, q1
+ vmov q10, q2
+ vmov q11, q3
+
+ bl chacha_permute
+
+ add ip, r2, #0x20
+ vld1.8 {q4-q5}, [r2]
+ vld1.8 {q6-q7}, [ip]
+
+ // o0 = i0 ^ (x0 + s0)
+ vadd.i32 q0, q0, q8
+ veor q0, q0, q4
+
+ // o1 = i1 ^ (x1 + s1)
+ vadd.i32 q1, q1, q9
+ veor q1, q1, q5
+
+ // o2 = i2 ^ (x2 + s2)
+ vadd.i32 q2, q2, q10
+ veor q2, q2, q6
+
+ // o3 = i3 ^ (x3 + s3)
+ vadd.i32 q3, q3, q11
+ veor q3, q3, q7
+
+ add ip, r1, #0x20
+ vst1.8 {q0-q1}, [r1]
+ vst1.8 {q2-q3}, [ip]
+
+ pop {pc}
+ENDPROC(chacha_block_xor_neon)
+
+ENTRY(hchacha_block_neon)
+ // r0: Input state matrix, s
+ // r1: output (8 32-bit words)
+ // r2: nrounds
+ push {lr}
+
+ vld1.32 {q0-q1}, [r0]!
+ vld1.32 {q2-q3}, [r0]
+
+ mov r3, r2
+ bl chacha_permute
+
+ vst1.32 {q0}, [r1]!
+ vst1.32 {q3}, [r1]
+
+ pop {pc}
+ENDPROC(hchacha_block_neon)
+
+ .align 4
+.Lctrinc: .word 0, 1, 2, 3
+.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6
+
+ .align 5
+ENTRY(chacha_4block_xor_neon)
+ push {r4, lr}
+ mov r4, sp // preserve the stack pointer
+ sub ip, sp, #0x20 // allocate a 32 byte buffer
+ bic ip, ip, #0x1f // aligned to 32 bytes
+ mov sp, ip
+
+ // r0: Input state matrix, s
+ // r1: 4 data blocks output, o
+ // r2: 4 data blocks input, i
+ // r3: nrounds
+
+ //
+ // This function encrypts four consecutive ChaCha blocks by loading
+ // the state matrix in NEON registers four times. The algorithm performs
+ // each operation on the corresponding word of each state matrix, hence
+ // requires no word shuffling. The words are re-interleaved before the
+ // final addition of the original state and the XORing step.
+ //
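+	// After the vdup.32 sequence below, register qN holds word N of all
+	// four state copies, e.g. (a sketch):
+	//
+	//	q0 = {x0, x0, x0, x0}	...	q15 = {x15, x15, x15, x15}
+	//
+	// with the per-block counter increments 0-3 then added into q12.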
+
+ // x0..15[0-3] = s0..15[0-3]
+ add ip, r0, #0x20
+ vld1.32 {q0-q1}, [r0]
+ vld1.32 {q2-q3}, [ip]
+
+ adr lr, .Lctrinc
+ vdup.32 q15, d7[1]
+ vdup.32 q14, d7[0]
+ vld1.32 {q4}, [lr, :128]
+ vdup.32 q13, d6[1]
+ vdup.32 q12, d6[0]
+ vdup.32 q11, d5[1]
+ vdup.32 q10, d5[0]
+ vadd.u32 q12, q12, q4 // x12 += counter values 0-3
+ vdup.32 q9, d4[1]
+ vdup.32 q8, d4[0]
+ vdup.32 q7, d3[1]
+ vdup.32 q6, d3[0]
+ vdup.32 q5, d2[1]
+ vdup.32 q4, d2[0]
+ vdup.32 q3, d1[1]
+ vdup.32 q2, d1[0]
+ vdup.32 q1, d0[1]
+ vdup.32 q0, d0[0]
+
+ adr ip, .Lrol8_table
+ b 1f
+
+.Ldoubleround4:
+ vld1.32 {q8-q9}, [sp, :256]
+1:
+ // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ vadd.i32 q0, q0, q4
+ vadd.i32 q1, q1, q5
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+
+ veor q12, q12, q0
+ veor q13, q13, q1
+ veor q14, q14, q2
+ veor q15, q15, q3
+
+ vrev32.16 q12, q12
+ vrev32.16 q13, q13
+ vrev32.16 q14, q14
+ vrev32.16 q15, q15
+
+ // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vadd.i32 q10, q10, q14
+ vadd.i32 q11, q11, q15
+
+ vst1.32 {q8-q9}, [sp, :256]
+
+ veor q8, q4, q8
+ veor q9, q5, q9
+ vshl.u32 q4, q8, #12
+ vshl.u32 q5, q9, #12
+ vsri.u32 q4, q8, #20
+ vsri.u32 q5, q9, #20
+
+ veor q8, q6, q10
+ veor q9, q7, q11
+ vshl.u32 q6, q8, #12
+ vshl.u32 q7, q9, #12
+ vsri.u32 q6, q8, #20
+ vsri.u32 q7, q9, #20
+
+ // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ vld1.8 {d16}, [ip, :64]
+ vadd.i32 q0, q0, q4
+ vadd.i32 q1, q1, q5
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+
+ veor q12, q12, q0
+ veor q13, q13, q1
+ veor q14, q14, q2
+ veor q15, q15, q3
+
+ vtbl.8 d24, {d24}, d16
+ vtbl.8 d25, {d25}, d16
+ vtbl.8 d26, {d26}, d16
+ vtbl.8 d27, {d27}, d16
+ vtbl.8 d28, {d28}, d16
+ vtbl.8 d29, {d29}, d16
+ vtbl.8 d30, {d30}, d16
+ vtbl.8 d31, {d31}, d16
+
+ vld1.32 {q8-q9}, [sp, :256]
+
+ // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vadd.i32 q10, q10, q14
+ vadd.i32 q11, q11, q15
+
+ vst1.32 {q8-q9}, [sp, :256]
+
+ veor q8, q4, q8
+ veor q9, q5, q9
+ vshl.u32 q4, q8, #7
+ vshl.u32 q5, q9, #7
+ vsri.u32 q4, q8, #25
+ vsri.u32 q5, q9, #25
+
+ veor q8, q6, q10
+ veor q9, q7, q11
+ vshl.u32 q6, q8, #7
+ vshl.u32 q7, q9, #7
+ vsri.u32 q6, q8, #25
+ vsri.u32 q7, q9, #25
+
+ vld1.32 {q8-q9}, [sp, :256]
+
+ // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ vadd.i32 q0, q0, q5
+ vadd.i32 q1, q1, q6
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q4
+
+ veor q15, q15, q0
+ veor q12, q12, q1
+ veor q13, q13, q2
+ veor q14, q14, q3
+
+ vrev32.16 q15, q15
+ vrev32.16 q12, q12
+ vrev32.16 q13, q13
+ vrev32.16 q14, q14
+
+ // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ vadd.i32 q10, q10, q15
+ vadd.i32 q11, q11, q12
+ vadd.i32 q8, q8, q13
+ vadd.i32 q9, q9, q14
+
+ vst1.32 {q8-q9}, [sp, :256]
+
+ veor q8, q7, q8
+ veor q9, q4, q9
+ vshl.u32 q7, q8, #12
+ vshl.u32 q4, q9, #12
+ vsri.u32 q7, q8, #20
+ vsri.u32 q4, q9, #20
+
+ veor q8, q5, q10
+ veor q9, q6, q11
+ vshl.u32 q5, q8, #12
+ vshl.u32 q6, q9, #12
+ vsri.u32 q5, q8, #20
+ vsri.u32 q6, q9, #20
+
+ // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ vld1.8 {d16}, [ip, :64]
+ vadd.i32 q0, q0, q5
+ vadd.i32 q1, q1, q6
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q4
+
+ veor q15, q15, q0
+ veor q12, q12, q1
+ veor q13, q13, q2
+ veor q14, q14, q3
+
+ vtbl.8 d30, {d30}, d16
+ vtbl.8 d31, {d31}, d16
+ vtbl.8 d24, {d24}, d16
+ vtbl.8 d25, {d25}, d16
+ vtbl.8 d26, {d26}, d16
+ vtbl.8 d27, {d27}, d16
+ vtbl.8 d28, {d28}, d16
+ vtbl.8 d29, {d29}, d16
+
+ vld1.32 {q8-q9}, [sp, :256]
+
+ // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ vadd.i32 q10, q10, q15
+ vadd.i32 q11, q11, q12
+ vadd.i32 q8, q8, q13
+ vadd.i32 q9, q9, q14
+
+ vst1.32 {q8-q9}, [sp, :256]
+
+ veor q8, q7, q8
+ veor q9, q4, q9
+ vshl.u32 q7, q8, #7
+ vshl.u32 q4, q9, #7
+ vsri.u32 q7, q8, #25
+ vsri.u32 q4, q9, #25
+
+ veor q8, q5, q10
+ veor q9, q6, q11
+ vshl.u32 q5, q8, #7
+ vshl.u32 q6, q9, #7
+ vsri.u32 q5, q8, #25
+ vsri.u32 q6, q9, #25
+
+ subs r3, r3, #2
+ bne .Ldoubleround4
+
+ // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
+ // x8..9[0-3] are on the stack.
+
+ // Re-interleave the words in the first two rows of each block (x0..7).
+ // Also add the counter values 0-3 to x12[0-3].
+ vld1.32 {q8}, [lr, :128] // load counter values 0-3
+ vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1)
+ vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3)
+ vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5)
+ vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7)
+ vadd.u32 q12, q8 // x12 += counter values 0-3
+ vswp d1, d4
+ vswp d3, d6
+ vld1.32 {q8-q9}, [r0]! // load s0..7
+ vswp d9, d12
+ vswp d11, d14
+
+ // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
+ // after XORing the first 32 bytes.
+ vswp q1, q4
+
+ // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
+
+ // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block)
+ vadd.u32 q0, q0, q8
+ vadd.u32 q2, q2, q8
+ vadd.u32 q4, q4, q8
+ vadd.u32 q3, q3, q8
+
+ // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block)
+ vadd.u32 q1, q1, q9
+ vadd.u32 q6, q6, q9
+ vadd.u32 q5, q5, q9
+ vadd.u32 q7, q7, q9
+
+ // XOR first 32 bytes using keystream from first two rows of first block
+ vld1.8 {q8-q9}, [r2]!
+ veor q8, q8, q0
+ veor q9, q9, q1
+ vst1.8 {q8-q9}, [r1]!
+
+ // Re-interleave the words in the last two rows of each block (x8..15).
+ vld1.32 {q8-q9}, [sp, :256]
+ mov sp, r4 // restore original stack pointer
+ ldr r4, [r4, #8] // load number of bytes
+ vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13)
+ vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15)
+ vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9)
+ vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11)
+ vld1.32 {q0-q1}, [r0] // load s8..15
+ vswp d25, d28
+ vswp d27, d30
+ vswp d17, d20
+ vswp d19, d22
+
+ // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
+
+ // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block)
+ vadd.u32 q8, q8, q0
+ vadd.u32 q10, q10, q0
+ vadd.u32 q9, q9, q0
+ vadd.u32 q11, q11, q0
+
+ // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
+ vadd.u32 q12, q12, q1
+ vadd.u32 q14, q14, q1
+ vadd.u32 q13, q13, q1
+ vadd.u32 q15, q15, q1
+
+ // XOR the rest of the data with the keystream
+
+ vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #96
+ veor q0, q0, q8
+ veor q1, q1, q12
+ ble .Lle96
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
+ veor q0, q0, q2
+ veor q1, q1, q6
+ ble .Lle128
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
+ veor q0, q0, q10
+ veor q1, q1, q14
+ ble .Lle160
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
+ veor q0, q0, q4
+ veor q1, q1, q5
+ ble .Lle192
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
+ veor q0, q0, q9
+ veor q1, q1, q13
+ ble .Lle224
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
+ veor q0, q0, q3
+ veor q1, q1, q7
+ blt .Llt256
+.Lout:
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]
+ veor q0, q0, q11
+ veor q1, q1, q15
+ vst1.8 {q0-q1}, [r1]
+
+ pop {r4, pc}
+
+.Lle192:
+ vmov q4, q9
+ vmov q5, q13
+
+.Lle160:
+ // nothing to do
+
+.Lfinalblock:
+	// Process the final block when fewer than 4 full blocks are being
+	// processed. Entered with 32 bytes of ChaCha cipher stream in q4-q5,
+	// and the previous 32-byte output block that still needs to be written
+	// at [r1] in q0-q1.
+ beq .Lfullblock
+
+.Lpartialblock:
+ adr lr, .Lpermute + 32
+ add r2, r2, r4
+ add lr, lr, r4
+ add r4, r4, r1
+
+ vld1.8 {q2-q3}, [lr]
+ vld1.8 {q6-q7}, [r2]
+
+ add r4, r4, #32
+
+ vtbl.8 d4, {q4-q5}, d4
+ vtbl.8 d5, {q4-q5}, d5
+ vtbl.8 d6, {q4-q5}, d6
+ vtbl.8 d7, {q4-q5}, d7
+
+ veor q6, q6, q2
+ veor q7, q7, q3
+
+ vst1.8 {q6-q7}, [r4] // overlapping stores
+ vst1.8 {q0-q1}, [r1]
+ pop {r4, pc}
+
+.Lfullblock:
+ vmov q11, q4
+ vmov q15, q5
+ b .Lout
+.Lle96:
+ vmov q4, q2
+ vmov q5, q6
+ b .Lfinalblock
+.Lle128:
+ vmov q4, q10
+ vmov q5, q14
+ b .Lfinalblock
+.Lle224:
+ vmov q4, q3
+ vmov q5, q7
+ b .Lfinalblock
+.Llt256:
+ vmov q4, q11
+ vmov q5, q15
+ b .Lpartialblock
+ENDPROC(chacha_4block_xor_neon)
+
+ .align L1_CACHE_SHIFT
+.Lpermute:
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
diff --git a/lib/crypto/arm/chacha-scalar-core.S b/lib/crypto/arm/chacha-scalar-core.S
new file mode 100644
index 000000000000..4951df05c158
--- /dev/null
+++ b/lib/crypto/arm/chacha-scalar-core.S
@@ -0,0 +1,444 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Google, Inc.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * Design notes:
+ *
+ * 16 registers would be needed to hold the state matrix, but only 14 are
+ * available because 'sp' and 'pc' cannot be used. So we spill the elements
+ * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
+ * 'ldrd' and one 'strd' instruction per round.
+ *
+ * All rotates are performed using the implicit rotate operand accepted by the
+ * 'add' and 'eor' instructions. This is faster than using explicit rotate
+ * instructions. To make this work, we allow the values in the second and last
+ * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
+ * wrong rotation amount. The rotation amount is then fixed up just in time
+ * when the values are used. 'brot' is the number of bits the values in row 'b'
+ * need to be rotated right to arrive at the correct values, and 'drot'
+ * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
+ * that they end up as (25, 24) after every round.
+ */
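+
+/*
+ * For example, the (x8, x9) <-> (x10, x11) swap in _doubleround costs one
+ * __strd plus one __ldrd per round (a sketch of the pattern used there):
+ *
+ *	__strd	X8_X10, X9_X11, sp, 0	// spill (x8, x9)
+ *	__ldrd	X8_X10, X9_X11, sp, 8	// reload (x10, x11)
+ */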
+
+ // ChaCha state registers
+ X0 .req r0
+ X1 .req r1
+ X2 .req r2
+ X3 .req r3
+ X4 .req r4
+ X5 .req r5
+ X6 .req r6
+ X7 .req r7
+ X8_X10 .req r8 // shared by x8 and x10
+ X9_X11 .req r9 // shared by x9 and x11
+ X12 .req r10
+ X13 .req r11
+ X14 .req r12
+ X15 .req r14
+
+.macro _le32_bswap_4x a, b, c, d, tmp
+#ifdef __ARMEB__
+ rev_l \a, \tmp
+ rev_l \b, \tmp
+ rev_l \c, \tmp
+ rev_l \d, \tmp
+#endif
+.endm
+
+.macro __ldrd a, b, src, offset
+#if __LINUX_ARM_ARCH__ >= 6
+ ldrd \a, \b, [\src, #\offset]
+#else
+ ldr \a, [\src, #\offset]
+ ldr \b, [\src, #\offset + 4]
+#endif
+.endm
+
+.macro __strd a, b, dst, offset
+#if __LINUX_ARM_ARCH__ >= 6
+ strd \a, \b, [\dst, #\offset]
+#else
+ str \a, [\dst, #\offset]
+ str \b, [\dst, #\offset + 4]
+#endif
+.endm
+
+.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2
+
+ // a += b; d ^= a; d = rol(d, 16);
+ add \a1, \a1, \b1, ror #brot
+ add \a2, \a2, \b2, ror #brot
+ eor \d1, \a1, \d1, ror #drot
+ eor \d2, \a2, \d2, ror #drot
+ // drot == 32 - 16 == 16
+
+ // c += d; b ^= c; b = rol(b, 12);
+ add \c1, \c1, \d1, ror #16
+ add \c2, \c2, \d2, ror #16
+ eor \b1, \c1, \b1, ror #brot
+ eor \b2, \c2, \b2, ror #brot
+ // brot == 32 - 12 == 20
+
+ // a += b; d ^= a; d = rol(d, 8);
+ add \a1, \a1, \b1, ror #20
+ add \a2, \a2, \b2, ror #20
+ eor \d1, \a1, \d1, ror #16
+ eor \d2, \a2, \d2, ror #16
+ // drot == 32 - 8 == 24
+
+ // c += d; b ^= c; b = rol(b, 7);
+ add \c1, \c1, \d1, ror #24
+ add \c2, \c2, \d2, ror #24
+ eor \b1, \c1, \b1, ror #20
+ eor \b2, \c2, \b2, ror #20
+ // brot == 32 - 7 == 25
+.endm
+
+.macro _doubleround
+
+ // column round
+
+ // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
+ _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13
+
+ // save (x8, x9); restore (x10, x11)
+ __strd X8_X10, X9_X11, sp, 0
+ __ldrd X8_X10, X9_X11, sp, 8
+
+ // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
+ _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15
+
+ .set brot, 25
+ .set drot, 24
+
+ // diagonal round
+
+ // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
+ _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12
+
+ // save (x10, x11); restore (x8, x9)
+ __strd X8_X10, X9_X11, sp, 8
+ __ldrd X8_X10, X9_X11, sp, 0
+
+ // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
+ _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
+.endm
+
+.macro _chacha_permute nrounds
+ .set brot, 0
+ .set drot, 0
+ .rept \nrounds / 2
+ _doubleround
+ .endr
+.endm
+
+.macro _chacha nrounds
+
+.Lnext_block\@:
+ // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
+ // Registers contain x0-x9,x12-x15.
+
+ // Do the core ChaCha permutation to update x0-x15.
+ _chacha_permute \nrounds
+
+ add sp, #8
+ // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
+ // Registers contain x0-x9,x12-x15.
+ // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+ // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
+ push {X8_X10, X9_X11, X12, X13, X14, X15}
+
+ // Load (OUT, IN, LEN).
+ ldr r14, [sp, #96]
+ ldr r12, [sp, #100]
+ ldr r11, [sp, #104]
+
+ orr r10, r14, r12
+
+ // Use slow path if fewer than 64 bytes remain.
+ cmp r11, #64
+ blt .Lxor_slowpath\@
+
+ // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
+ // ARMv6+, since ldmia and stmia (used below) still require alignment.
+ tst r10, #3
+ bne .Lxor_slowpath\@
+
+ // Fast path: XOR 64 bytes of aligned data.
+
+ // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
+ // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
+ // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+ // x0-x3
+ __ldrd r8, r9, sp, 32
+ __ldrd r10, r11, sp, 40
+ add X0, X0, r8
+ add X1, X1, r9
+ add X2, X2, r10
+ add X3, X3, r11
+ _le32_bswap_4x X0, X1, X2, X3, r8
+ ldmia r12!, {r8-r11}
+ eor X0, X0, r8
+ eor X1, X1, r9
+ eor X2, X2, r10
+ eor X3, X3, r11
+ stmia r14!, {X0-X3}
+
+ // x4-x7
+ __ldrd r8, r9, sp, 48
+ __ldrd r10, r11, sp, 56
+ add X4, r8, X4, ror #brot
+ add X5, r9, X5, ror #brot
+ ldmia r12!, {X0-X3}
+ add X6, r10, X6, ror #brot
+ add X7, r11, X7, ror #brot
+ _le32_bswap_4x X4, X5, X6, X7, r8
+ eor X4, X4, X0
+ eor X5, X5, X1
+ eor X6, X6, X2
+ eor X7, X7, X3
+ stmia r14!, {X4-X7}
+
+ // x8-x15
+ pop {r0-r7} // (x8-x9,x12-x15,x10-x11)
+ __ldrd r8, r9, sp, 32
+ __ldrd r10, r11, sp, 40
+ add r0, r0, r8 // x8
+ add r1, r1, r9 // x9
+ add r6, r6, r10 // x10
+ add r7, r7, r11 // x11
+ _le32_bswap_4x r0, r1, r6, r7, r8
+ ldmia r12!, {r8-r11}
+ eor r0, r0, r8 // x8
+ eor r1, r1, r9 // x9
+ eor r6, r6, r10 // x10
+ eor r7, r7, r11 // x11
+ stmia r14!, {r0,r1,r6,r7}
+ ldmia r12!, {r0,r1,r6,r7}
+ __ldrd r8, r9, sp, 48
+ __ldrd r10, r11, sp, 56
+ add r2, r8, r2, ror #drot // x12
+ add r3, r9, r3, ror #drot // x13
+ add r4, r10, r4, ror #drot // x14
+ add r5, r11, r5, ror #drot // x15
+ _le32_bswap_4x r2, r3, r4, r5, r9
+ ldr r9, [sp, #72] // load LEN
+ eor r2, r2, r0 // x12
+ eor r3, r3, r1 // x13
+ eor r4, r4, r6 // x14
+ eor r5, r5, r7 // x15
+ subs r9, #64 // decrement and check LEN
+ stmia r14!, {r2-r5}
+
+ beq .Ldone\@
+
+.Lprepare_for_next_block\@:
+
+ // Stack: x0-x15 OUT IN LEN
+
+ // Increment block counter (x12)
+ add r8, #1
+
+ // Store updated (OUT, IN, LEN)
+ str r14, [sp, #64]
+ str r12, [sp, #68]
+ str r9, [sp, #72]
+
+ mov r14, sp
+
+ // Store updated block counter (x12)
+ str r8, [sp, #48]
+
+ sub sp, #16
+
+ // Reload state and do next block
+ ldmia r14!, {r0-r11} // load x0-x11
+ __strd r10, r11, sp, 8 // store x10-x11 before state
+ ldmia r14, {r10-r12,r14} // load x12-x15
+ b .Lnext_block\@
+
+.Lxor_slowpath\@:
+ // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
+ // We handle it by storing the 64 bytes of keystream to the stack, then
+ // XOR-ing the needed portion with the data.
+
+ // Allocate keystream buffer
+ sub sp, #64
+ mov r14, sp
+
+ // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
+ // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
+ // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+ // Save keystream for x0-x3
+ __ldrd r8, r9, sp, 96
+ __ldrd r10, r11, sp, 104
+ add X0, X0, r8
+ add X1, X1, r9
+ add X2, X2, r10
+ add X3, X3, r11
+ _le32_bswap_4x X0, X1, X2, X3, r8
+ stmia r14!, {X0-X3}
+
+ // Save keystream for x4-x7
+ __ldrd r8, r9, sp, 112
+ __ldrd r10, r11, sp, 120
+ add X4, r8, X4, ror #brot
+ add X5, r9, X5, ror #brot
+ add X6, r10, X6, ror #brot
+ add X7, r11, X7, ror #brot
+ _le32_bswap_4x X4, X5, X6, X7, r8
+ add r8, sp, #64
+ stmia r14!, {X4-X7}
+
+ // Save keystream for x8-x15
+ ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11)
+ __ldrd r8, r9, sp, 128
+ __ldrd r10, r11, sp, 136
+ add r0, r0, r8 // x8
+ add r1, r1, r9 // x9
+ add r6, r6, r10 // x10
+ add r7, r7, r11 // x11
+ _le32_bswap_4x r0, r1, r6, r7, r8
+ stmia r14!, {r0,r1,r6,r7}
+ __ldrd r8, r9, sp, 144
+ __ldrd r10, r11, sp, 152
+ add r2, r8, r2, ror #drot // x12
+ add r3, r9, r3, ror #drot // x13
+ add r4, r10, r4, ror #drot // x14
+ add r5, r11, r5, ror #drot // x15
+ _le32_bswap_4x r2, r3, r4, r5, r9
+ stmia r14, {r2-r5}
+
+ // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
+ // Registers: r8 is block counter, r12 is IN.
+
+ ldr r9, [sp, #168] // LEN
+ ldr r14, [sp, #160] // OUT
+ cmp r9, #64
+ mov r0, sp
+ movle r1, r9
+ movgt r1, #64
+ // r1 is number of bytes to XOR, in range [1, 64]
+
+.if __LINUX_ARM_ARCH__ < 6
+ orr r2, r12, r14
+ tst r2, #3 // IN or OUT misaligned?
+ bne .Lxor_next_byte\@
+.endif
+
+ // XOR a word at a time
+.rept 16
+ subs r1, #4
+ blt .Lxor_words_done\@
+ ldr r2, [r12], #4
+ ldr r3, [r0], #4
+ eor r2, r2, r3
+ str r2, [r14], #4
+.endr
+ b .Lxor_slowpath_done\@
+.Lxor_words_done\@:
+ ands r1, r1, #3
+ beq .Lxor_slowpath_done\@
+
+ // XOR a byte at a time
+.Lxor_next_byte\@:
+ ldrb r2, [r12], #1
+ ldrb r3, [r0], #1
+ eor r2, r2, r3
+ strb r2, [r14], #1
+ subs r1, #1
+ bne .Lxor_next_byte\@
+
+.Lxor_slowpath_done\@:
+ subs r9, #64
+ add sp, #96
+ bgt .Lprepare_for_next_block\@
+
+.Ldone\@:
+.endm // _chacha
+
+/*
+ * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
+ * const struct chacha_state *state, int nrounds);
+ */
+ENTRY(chacha_doarm)
+ cmp r2, #0 // len == 0?
+ reteq lr
+
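+	// The fifth argument, 'nrounds', is passed on the stack per the AAPCS.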
+ ldr ip, [sp]
+ cmp ip, #12
+
+ push {r0-r2,r4-r11,lr}
+
+ // Push state x0-x15 onto stack.
+ // Also store an extra copy of x10-x11 just before the state.
+
+ add X12, r3, #48
+ ldm X12, {X12,X13,X14,X15}
+ push {X12,X13,X14,X15}
+ sub sp, sp, #64
+
+ __ldrd X8_X10, X9_X11, r3, 40
+ __strd X8_X10, X9_X11, sp, 8
+ __strd X8_X10, X9_X11, sp, 56
+ ldm r3, {X0-X9_X11}
+ __strd X0, X1, sp, 16
+ __strd X2, X3, sp, 24
+ __strd X4, X5, sp, 32
+ __strd X6, X7, sp, 40
+ __strd X8_X10, X9_X11, sp, 48
+
+ beq 1f
+ _chacha 20
+
+0: add sp, #76
+ pop {r4-r11, pc}
+
+1: _chacha 12
+ b 0b
+ENDPROC(chacha_doarm)
+
+/*
+ * void hchacha_block_arm(const struct chacha_state *state,
+ * u32 out[HCHACHA_OUT_WORDS], int nrounds);
+ */
+ENTRY(hchacha_block_arm)
+ push {r1,r4-r11,lr}
+
+ cmp r2, #12 // ChaCha12 ?
+
+ mov r14, r0
+ ldmia r14!, {r0-r11} // load x0-x11
+ push {r10-r11} // store x10-x11 to stack
+ ldm r14, {r10-r12,r14} // load x12-x15
+ sub sp, #8
+
+ beq 1f
+ _chacha_permute 20
+
+ // Skip over (unused0-unused1, x10-x11)
+0: add sp, #16
+
+ // Fix up rotations of x12-x15
+ ror X12, X12, #drot
+ ror X13, X13, #drot
+ pop {r4} // load 'out'
+ ror X14, X14, #drot
+ ror X15, X15, #drot
+
+ // Store (x0-x3,x12-x15) to 'out'
+ stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
+
+ pop {r4-r11,pc}
+
+1: _chacha_permute 12
+ b 0b
+ENDPROC(hchacha_block_arm)
diff --git a/lib/crypto/arm/chacha.h b/lib/crypto/arm/chacha.h
new file mode 100644
index 000000000000..836e49088e98
--- /dev/null
+++ b/lib/crypto/arm/chacha.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ChaCha and HChaCha functions (ARM optimized)
+ *
+ * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <crypto/internal/simd.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+
+#include <asm/cputype.h>
+#include <asm/hwcap.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
+ u8 *dst, const u8 *src, int nrounds);
+asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ int nrounds, unsigned int nbytes);
+asmlinkage void hchacha_block_arm(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds);
+asmlinkage void hchacha_block_neon(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds);
+
+asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
+ const struct chacha_state *state, int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
+
+static inline bool neon_usable(void)
+{
+ return static_branch_likely(&use_neon) && crypto_simd_usable();
+}
+
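+/*
+ * Encrypt in chunks of up to four blocks with the 4-block NEON kernel
+ * (which handles a partial final span itself), then finish a last 1..64
+ * bytes with the single-block kernel, bouncing any partial block through a
+ * stack buffer so the asm always reads and writes a full block.
+ */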
+static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ u8 buf[CHACHA_BLOCK_SIZE];
+
+ while (bytes > CHACHA_BLOCK_SIZE) {
+ unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
+
+ chacha_4block_xor_neon(state, dst, src, nrounds, l);
+ bytes -= l;
+ src += l;
+ dst += l;
+ state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
+ }
+ if (bytes) {
+ const u8 *s = src;
+ u8 *d = dst;
+
+ if (bytes != CHACHA_BLOCK_SIZE)
+ s = d = memcpy(buf, src, bytes);
+ chacha_block_xor_neon(state, d, s, nrounds);
+ if (d != dst)
+ memcpy(dst, buf, bytes);
+ state->x[12]++;
+ }
+}
+
+static void hchacha_block_arch(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+ if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
+ hchacha_block_arm(state, out, nrounds);
+ } else {
+ scoped_ksimd()
+ hchacha_block_neon(state, out, nrounds);
+ }
+}
+
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int bytes, int nrounds)
+{
+ if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
+ bytes <= CHACHA_BLOCK_SIZE) {
+ chacha_doarm(dst, src, bytes, state, nrounds);
+ state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
+ return;
+ }
+
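+	/*
+	 * Process at most SZ_4K per scoped_ksimd() section, to bound the time
+	 * spent in each kernel-mode NEON section and keep worst-case
+	 * scheduling latency reasonable.
+	 */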
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ scoped_ksimd()
+ chacha_doneon(state, dst, src, todo, nrounds);
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
+}
+
+#define chacha_mod_init_arch chacha_mod_init_arch
+static void chacha_mod_init_arch(void)
+{
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
+ switch (read_cpuid_part()) {
+ case ARM_CPU_PART_CORTEX_A7:
+ case ARM_CPU_PART_CORTEX_A5:
+ /*
+			 * The Cortex-A7 and Cortex-A5 do not perform well with
+			 * the NEON implementation, but they do incredibly well
+			 * with the scalar one and use less power.
+ */
+ break;
+ default:
+ static_branch_enable(&use_neon);
+ }
+ }
+}
diff --git a/lib/crypto/arm/curve25519-core.S b/lib/crypto/arm/curve25519-core.S
new file mode 100644
index 000000000000..b697fa5d059a
--- /dev/null
+++ b/lib/crypto/arm/curve25519-core.S
@@ -0,0 +1,2062 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
+ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
+ * manually reworked for use in kernel space.
+ */
+
+#include <linux/linkage.h>
+
+.text
+.arch armv7-a
+.fpu neon
+.align 4
+
+ENTRY(curve25519_neon)
+ push {r4-r11, lr}
+ mov ip, sp
+ sub r3, sp, #704
+ and r3, r3, #0xfffffff0
+ mov sp, r3
+ movw r4, #0
+ movw r5, #254
+ vmov.i32 q0, #1
+ vshr.u64 q1, q0, #7
+ vshr.u64 q0, q0, #8
+ vmov.i32 d4, #19
+ vmov.i32 d5, #38
+ add r6, sp, #480
+ vst1.8 {d2-d3}, [r6, : 128]!
+ vst1.8 {d0-d1}, [r6, : 128]!
+ vst1.8 {d4-d5}, [r6, : 128]
+ add r6, r3, #0
+ vmov.i32 q2, #0
+ vst1.8 {d4-d5}, [r6, : 128]!
+ vst1.8 {d4-d5}, [r6, : 128]!
+ vst1.8 d4, [r6, : 64]
+ add r6, r3, #0
+ movw r7, #960
+ sub r7, r7, #2
+ neg r7, r7
+ sub r7, r7, r7, LSL #7
+ str r7, [r6]
+ add r6, sp, #672
+ vld1.8 {d4-d5}, [r1]!
+ vld1.8 {d6-d7}, [r1]
+ vst1.8 {d4-d5}, [r6, : 128]!
+ vst1.8 {d6-d7}, [r6, : 128]
+ sub r1, r6, #16
+ ldrb r6, [r1]
+ and r6, r6, #248
+ strb r6, [r1]
+ ldrb r6, [r1, #31]
+ and r6, r6, #127
+ orr r6, r6, #64
+ strb r6, [r1, #31]
+ vmov.i64 q2, #0xffffffff
+ vshr.u64 q3, q2, #7
+ vshr.u64 q2, q2, #6
+ vld1.8 {d8}, [r2]
+ vld1.8 {d10}, [r2]
+ add r2, r2, #6
+ vld1.8 {d12}, [r2]
+ vld1.8 {d14}, [r2]
+ add r2, r2, #6
+ vld1.8 {d16}, [r2]
+ add r2, r2, #4
+ vld1.8 {d18}, [r2]
+ vld1.8 {d20}, [r2]
+ add r2, r2, #6
+ vld1.8 {d22}, [r2]
+ add r2, r2, #2
+ vld1.8 {d24}, [r2]
+ vld1.8 {d26}, [r2]
+ vshr.u64 q5, q5, #26
+ vshr.u64 q6, q6, #3
+ vshr.u64 q7, q7, #29
+ vshr.u64 q8, q8, #6
+ vshr.u64 q10, q10, #25
+ vshr.u64 q11, q11, #3
+ vshr.u64 q12, q12, #12
+ vshr.u64 q13, q13, #38
+ vand q4, q4, q2
+ vand q6, q6, q2
+ vand q8, q8, q2
+ vand q10, q10, q2
+ vand q2, q12, q2
+ vand q5, q5, q3
+ vand q7, q7, q3
+ vand q9, q9, q3
+ vand q11, q11, q3
+ vand q3, q13, q3
+ add r2, r3, #48
+ vadd.i64 q12, q4, q1
+ vadd.i64 q13, q10, q1
+ vshr.s64 q12, q12, #26
+ vshr.s64 q13, q13, #26
+ vadd.i64 q5, q5, q12
+ vshl.i64 q12, q12, #26
+ vadd.i64 q14, q5, q0
+ vadd.i64 q11, q11, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q15, q11, q0
+ vsub.i64 q4, q4, q12
+ vshr.s64 q12, q14, #25
+ vsub.i64 q10, q10, q13
+ vshr.s64 q13, q15, #25
+ vadd.i64 q6, q6, q12
+ vshl.i64 q12, q12, #25
+ vadd.i64 q14, q6, q1
+ vadd.i64 q2, q2, q13
+ vsub.i64 q5, q5, q12
+ vshr.s64 q12, q14, #26
+ vshl.i64 q13, q13, #25
+ vadd.i64 q14, q2, q1
+ vadd.i64 q7, q7, q12
+ vshl.i64 q12, q12, #26
+ vadd.i64 q15, q7, q0
+ vsub.i64 q11, q11, q13
+ vshr.s64 q13, q14, #26
+ vsub.i64 q6, q6, q12
+ vshr.s64 q12, q15, #25
+ vadd.i64 q3, q3, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q14, q3, q0
+ vadd.i64 q8, q8, q12
+ vshl.i64 q12, q12, #25
+ vadd.i64 q15, q8, q1
+ add r2, r2, #8
+ vsub.i64 q2, q2, q13
+ vshr.s64 q13, q14, #25
+ vsub.i64 q7, q7, q12
+ vshr.s64 q12, q15, #26
+ vadd.i64 q14, q13, q13
+ vadd.i64 q9, q9, q12
+ vtrn.32 d12, d14
+ vshl.i64 q12, q12, #26
+ vtrn.32 d13, d15
+ vadd.i64 q0, q9, q0
+ vadd.i64 q4, q4, q14
+ vst1.8 d12, [r2, : 64]!
+ vshl.i64 q6, q13, #4
+ vsub.i64 q7, q8, q12
+ vshr.s64 q0, q0, #25
+ vadd.i64 q4, q4, q6
+ vadd.i64 q6, q10, q0
+ vshl.i64 q0, q0, #25
+ vadd.i64 q8, q6, q1
+ vadd.i64 q4, q4, q13
+ vshl.i64 q10, q13, #25
+ vadd.i64 q1, q4, q1
+ vsub.i64 q0, q9, q0
+ vshr.s64 q8, q8, #26
+ vsub.i64 q3, q3, q10
+ vtrn.32 d14, d0
+ vshr.s64 q1, q1, #26
+ vtrn.32 d15, d1
+ vadd.i64 q0, q11, q8
+ vst1.8 d14, [r2, : 64]
+ vshl.i64 q7, q8, #26
+ vadd.i64 q5, q5, q1
+ vtrn.32 d4, d6
+ vshl.i64 q1, q1, #26
+ vtrn.32 d5, d7
+ vsub.i64 q3, q6, q7
+ add r2, r2, #16
+ vsub.i64 q1, q4, q1
+ vst1.8 d4, [r2, : 64]
+ vtrn.32 d6, d0
+ vtrn.32 d7, d1
+ sub r2, r2, #8
+ vtrn.32 d2, d10
+ vtrn.32 d3, d11
+ vst1.8 d6, [r2, : 64]
+ sub r2, r2, #24
+ vst1.8 d2, [r2, : 64]
+ add r2, r3, #96
+ vmov.i32 q0, #0
+ vmov.i64 d2, #0xff
+ vmov.i64 d3, #0
+ vshr.u32 q1, q1, #7
+ vst1.8 {d2-d3}, [r2, : 128]!
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 d0, [r2, : 64]
+ add r2, r3, #144
+ vmov.i32 q0, #0
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 d0, [r2, : 64]
+ add r2, r3, #240
+ vmov.i32 q0, #0
+ vmov.i64 d2, #0xff
+ vmov.i64 d3, #0
+ vshr.u32 q1, q1, #7
+ vst1.8 {d2-d3}, [r2, : 128]!
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 d0, [r2, : 64]
+ add r2, r3, #48
+ add r6, r3, #192
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d4}, [r2, : 64]
+ vst1.8 {d0-d1}, [r6, : 128]!
+ vst1.8 {d2-d3}, [r6, : 128]!
+ vst1.8 d4, [r6, : 64]
+.Lmainloop:
+ mov r2, r5, LSR #3
+ and r6, r5, #7
+ ldrb r2, [r1, r2]
+ mov r2, r2, LSR r6
+ and r2, r2, #1
+ str r5, [sp, #456]
+ eor r4, r4, r2
+ str r2, [sp, #460]
+ neg r2, r4
+ add r4, r3, #96
+ add r5, r3, #192
+ add r6, r3, #144
+ vld1.8 {d8-d9}, [r4, : 128]!
+ add r7, r3, #240
+ vld1.8 {d10-d11}, [r5, : 128]!
+ veor q6, q4, q5
+ vld1.8 {d14-d15}, [r6, : 128]!
+ vdup.i32 q8, r2
+ vld1.8 {d18-d19}, [r7, : 128]!
+ veor q10, q7, q9
+ vld1.8 {d22-d23}, [r4, : 128]!
+ vand q6, q6, q8
+ vld1.8 {d24-d25}, [r5, : 128]!
+ vand q10, q10, q8
+ vld1.8 {d26-d27}, [r6, : 128]!
+ veor q4, q4, q6
+ vld1.8 {d28-d29}, [r7, : 128]!
+ veor q5, q5, q6
+ vld1.8 {d0}, [r4, : 64]
+ veor q6, q7, q10
+ vld1.8 {d2}, [r5, : 64]
+ veor q7, q9, q10
+ vld1.8 {d4}, [r6, : 64]
+ veor q9, q11, q12
+ vld1.8 {d6}, [r7, : 64]
+ veor q10, q0, q1
+ sub r2, r4, #32
+ vand q9, q9, q8
+ sub r4, r5, #32
+ vand q10, q10, q8
+ sub r5, r6, #32
+ veor q11, q11, q9
+ sub r6, r7, #32
+ veor q0, q0, q10
+ veor q9, q12, q9
+ veor q1, q1, q10
+ veor q10, q13, q14
+ veor q12, q2, q3
+ vand q10, q10, q8
+ vand q8, q12, q8
+ veor q12, q13, q10
+ veor q2, q2, q8
+ veor q10, q14, q10
+ veor q3, q3, q8
+ vadd.i32 q8, q4, q6
+ vsub.i32 q4, q4, q6
+ vst1.8 {d16-d17}, [r2, : 128]!
+ vadd.i32 q6, q11, q12
+ vst1.8 {d8-d9}, [r5, : 128]!
+ vsub.i32 q4, q11, q12
+ vst1.8 {d12-d13}, [r2, : 128]!
+ vadd.i32 q6, q0, q2
+ vst1.8 {d8-d9}, [r5, : 128]!
+ vsub.i32 q0, q0, q2
+ vst1.8 d12, [r2, : 64]
+ vadd.i32 q2, q5, q7
+ vst1.8 d0, [r5, : 64]
+ vsub.i32 q0, q5, q7
+ vst1.8 {d4-d5}, [r4, : 128]!
+ vadd.i32 q2, q9, q10
+ vst1.8 {d0-d1}, [r6, : 128]!
+ vsub.i32 q0, q9, q10
+ vst1.8 {d4-d5}, [r4, : 128]!
+ vadd.i32 q2, q1, q3
+ vst1.8 {d0-d1}, [r6, : 128]!
+ vsub.i32 q0, q1, q3
+ vst1.8 d4, [r4, : 64]
+ vst1.8 d0, [r6, : 64]
+ add r2, sp, #512
+ add r4, r3, #96
+ add r5, r3, #144
+ vld1.8 {d0-d1}, [r2, : 128]
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vld1.8 {d4-d5}, [r5, : 128]!
+ vzip.i32 q1, q2
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vld1.8 {d8-d9}, [r5, : 128]!
+ vshl.i32 q5, q1, #1
+ vzip.i32 q3, q4
+ vshl.i32 q6, q2, #1
+ vld1.8 {d14}, [r4, : 64]
+ vshl.i32 q8, q3, #1
+ vld1.8 {d15}, [r5, : 64]
+ vshl.i32 q9, q4, #1
+ vmul.i32 d21, d7, d1
+ vtrn.32 d14, d15
+ vmul.i32 q11, q4, q0
+ vmul.i32 q0, q7, q0
+ vmull.s32 q12, d2, d2
+ vmlal.s32 q12, d11, d1
+ vmlal.s32 q12, d12, d0
+ vmlal.s32 q12, d13, d23
+ vmlal.s32 q12, d16, d22
+ vmlal.s32 q12, d7, d21
+ vmull.s32 q10, d2, d11
+ vmlal.s32 q10, d4, d1
+ vmlal.s32 q10, d13, d0
+ vmlal.s32 q10, d6, d23
+ vmlal.s32 q10, d17, d22
+ vmull.s32 q13, d10, d4
+ vmlal.s32 q13, d11, d3
+ vmlal.s32 q13, d13, d1
+ vmlal.s32 q13, d16, d0
+ vmlal.s32 q13, d17, d23
+ vmlal.s32 q13, d8, d22
+ vmull.s32 q1, d10, d5
+ vmlal.s32 q1, d11, d4
+ vmlal.s32 q1, d6, d1
+ vmlal.s32 q1, d17, d0
+ vmlal.s32 q1, d8, d23
+ vmull.s32 q14, d10, d6
+ vmlal.s32 q14, d11, d13
+ vmlal.s32 q14, d4, d4
+ vmlal.s32 q14, d17, d1
+ vmlal.s32 q14, d18, d0
+ vmlal.s32 q14, d9, d23
+ vmull.s32 q11, d10, d7
+ vmlal.s32 q11, d11, d6
+ vmlal.s32 q11, d12, d5
+ vmlal.s32 q11, d8, d1
+ vmlal.s32 q11, d19, d0
+ vmull.s32 q15, d10, d8
+ vmlal.s32 q15, d11, d17
+ vmlal.s32 q15, d12, d6
+ vmlal.s32 q15, d13, d5
+ vmlal.s32 q15, d19, d1
+ vmlal.s32 q15, d14, d0
+ vmull.s32 q2, d10, d9
+ vmlal.s32 q2, d11, d8
+ vmlal.s32 q2, d12, d7
+ vmlal.s32 q2, d13, d6
+ vmlal.s32 q2, d14, d1
+ vmull.s32 q0, d15, d1
+ vmlal.s32 q0, d10, d14
+ vmlal.s32 q0, d11, d19
+ vmlal.s32 q0, d12, d8
+ vmlal.s32 q0, d13, d17
+ vmlal.s32 q0, d6, d6
+ add r2, sp, #480
+ vld1.8 {d18-d19}, [r2, : 128]!
+ vmull.s32 q3, d16, d7
+ vmlal.s32 q3, d10, d15
+ vmlal.s32 q3, d11, d14
+ vmlal.s32 q3, d12, d9
+ vmlal.s32 q3, d13, d8
+ vld1.8 {d8-d9}, [r2, : 128]
+ vadd.i64 q5, q12, q9
+ vadd.i64 q6, q15, q9
+ vshr.s64 q5, q5, #26
+ vshr.s64 q6, q6, #26
+ vadd.i64 q7, q10, q5
+ vshl.i64 q5, q5, #26
+ vadd.i64 q8, q7, q4
+ vadd.i64 q2, q2, q6
+ vshl.i64 q6, q6, #26
+ vadd.i64 q10, q2, q4
+ vsub.i64 q5, q12, q5
+ vshr.s64 q8, q8, #25
+ vsub.i64 q6, q15, q6
+ vshr.s64 q10, q10, #25
+ vadd.i64 q12, q13, q8
+ vshl.i64 q8, q8, #25
+ vadd.i64 q13, q12, q9
+ vadd.i64 q0, q0, q10
+ vsub.i64 q7, q7, q8
+ vshr.s64 q8, q13, #26
+ vshl.i64 q10, q10, #25
+ vadd.i64 q13, q0, q9
+ vadd.i64 q1, q1, q8
+ vshl.i64 q8, q8, #26
+ vadd.i64 q15, q1, q4
+ vsub.i64 q2, q2, q10
+ vshr.s64 q10, q13, #26
+ vsub.i64 q8, q12, q8
+ vshr.s64 q12, q15, #25
+ vadd.i64 q3, q3, q10
+ vshl.i64 q10, q10, #26
+ vadd.i64 q13, q3, q4
+ vadd.i64 q14, q14, q12
+ add r2, r3, #288
+ vshl.i64 q12, q12, #25
+ add r4, r3, #336
+ vadd.i64 q15, q14, q9
+ add r2, r2, #8
+ vsub.i64 q0, q0, q10
+ add r4, r4, #8
+ vshr.s64 q10, q13, #25
+ vsub.i64 q1, q1, q12
+ vshr.s64 q12, q15, #26
+ vadd.i64 q13, q10, q10
+ vadd.i64 q11, q11, q12
+ vtrn.32 d16, d2
+ vshl.i64 q12, q12, #26
+ vtrn.32 d17, d3
+ vadd.i64 q1, q11, q4
+ vadd.i64 q4, q5, q13
+ vst1.8 d16, [r2, : 64]!
+ vshl.i64 q5, q10, #4
+ vst1.8 d17, [r4, : 64]!
+ vsub.i64 q8, q14, q12
+ vshr.s64 q1, q1, #25
+ vadd.i64 q4, q4, q5
+ vadd.i64 q5, q6, q1
+ vshl.i64 q1, q1, #25
+ vadd.i64 q6, q5, q9
+ vadd.i64 q4, q4, q10
+ vshl.i64 q10, q10, #25
+ vadd.i64 q9, q4, q9
+ vsub.i64 q1, q11, q1
+ vshr.s64 q6, q6, #26
+ vsub.i64 q3, q3, q10
+ vtrn.32 d16, d2
+ vshr.s64 q9, q9, #26
+ vtrn.32 d17, d3
+ vadd.i64 q1, q2, q6
+ vst1.8 d16, [r2, : 64]
+ vshl.i64 q2, q6, #26
+ vst1.8 d17, [r4, : 64]
+ vadd.i64 q6, q7, q9
+ vtrn.32 d0, d6
+ vshl.i64 q7, q9, #26
+ vtrn.32 d1, d7
+ vsub.i64 q2, q5, q2
+ add r2, r2, #16
+ vsub.i64 q3, q4, q7
+ vst1.8 d0, [r2, : 64]
+ add r4, r4, #16
+ vst1.8 d1, [r4, : 64]
+ vtrn.32 d4, d2
+ vtrn.32 d5, d3
+ sub r2, r2, #8
+ sub r4, r4, #8
+ vtrn.32 d6, d12
+ vtrn.32 d7, d13
+ vst1.8 d4, [r2, : 64]
+ vst1.8 d5, [r4, : 64]
+ sub r2, r2, #24
+ sub r4, r4, #24
+ vst1.8 d6, [r2, : 64]
+ vst1.8 d7, [r4, : 64]
+ add r2, r3, #240
+ add r4, r3, #96
+ vld1.8 {d0-d1}, [r4, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vld1.8 {d4}, [r4, : 64]
+ add r4, r3, #144
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vtrn.32 q0, q3
+ vld1.8 {d8-d9}, [r4, : 128]!
+ vshl.i32 q5, q0, #4
+ vtrn.32 q1, q4
+ vshl.i32 q6, q3, #4
+ vadd.i32 q5, q5, q0
+ vadd.i32 q6, q6, q3
+ vshl.i32 q7, q1, #4
+ vld1.8 {d5}, [r4, : 64]
+ vshl.i32 q8, q4, #4
+ vtrn.32 d4, d5
+ vadd.i32 q7, q7, q1
+ vadd.i32 q8, q8, q4
+ vld1.8 {d18-d19}, [r2, : 128]!
+ vshl.i32 q10, q2, #4
+ vld1.8 {d22-d23}, [r2, : 128]!
+ vadd.i32 q10, q10, q2
+ vld1.8 {d24}, [r2, : 64]
+ vadd.i32 q5, q5, q0
+ add r2, r3, #192
+ vld1.8 {d26-d27}, [r2, : 128]!
+ vadd.i32 q6, q6, q3
+ vld1.8 {d28-d29}, [r2, : 128]!
+ vadd.i32 q8, q8, q4
+ vld1.8 {d25}, [r2, : 64]
+ vadd.i32 q10, q10, q2
+ vtrn.32 q9, q13
+ vadd.i32 q7, q7, q1
+ vadd.i32 q5, q5, q0
+ vtrn.32 q11, q14
+ vadd.i32 q6, q6, q3
+ add r2, sp, #528
+ vadd.i32 q10, q10, q2
+ vtrn.32 d24, d25
+ vst1.8 {d12-d13}, [r2, : 128]!
+ vshl.i32 q6, q13, #1
+ vst1.8 {d20-d21}, [r2, : 128]!
+ vshl.i32 q10, q14, #1
+ vst1.8 {d12-d13}, [r2, : 128]!
+ vshl.i32 q15, q12, #1
+ vadd.i32 q8, q8, q4
+ vext.32 d10, d31, d30, #0
+ vadd.i32 q7, q7, q1
+ vst1.8 {d16-d17}, [r2, : 128]!
+ vmull.s32 q8, d18, d5
+ vmlal.s32 q8, d26, d4
+ vmlal.s32 q8, d19, d9
+ vmlal.s32 q8, d27, d3
+ vmlal.s32 q8, d22, d8
+ vmlal.s32 q8, d28, d2
+ vmlal.s32 q8, d23, d7
+ vmlal.s32 q8, d29, d1
+ vmlal.s32 q8, d24, d6
+ vmlal.s32 q8, d25, d0
+ vst1.8 {d14-d15}, [r2, : 128]!
+ vmull.s32 q2, d18, d4
+ vmlal.s32 q2, d12, d9
+ vmlal.s32 q2, d13, d8
+ vmlal.s32 q2, d19, d3
+ vmlal.s32 q2, d22, d2
+ vmlal.s32 q2, d23, d1
+ vmlal.s32 q2, d24, d0
+ vst1.8 {d20-d21}, [r2, : 128]!
+ vmull.s32 q7, d18, d9
+ vmlal.s32 q7, d26, d3
+ vmlal.s32 q7, d19, d8
+ vmlal.s32 q7, d27, d2
+ vmlal.s32 q7, d22, d7
+ vmlal.s32 q7, d28, d1
+ vmlal.s32 q7, d23, d6
+ vmlal.s32 q7, d29, d0
+ vst1.8 {d10-d11}, [r2, : 128]!
+ vmull.s32 q5, d18, d3
+ vmlal.s32 q5, d19, d2
+ vmlal.s32 q5, d22, d1
+ vmlal.s32 q5, d23, d0
+ vmlal.s32 q5, d12, d8
+ vst1.8 {d16-d17}, [r2, : 128]
+ vmull.s32 q4, d18, d8
+ vmlal.s32 q4, d26, d2
+ vmlal.s32 q4, d19, d7
+ vmlal.s32 q4, d27, d1
+ vmlal.s32 q4, d22, d6
+ vmlal.s32 q4, d28, d0
+ vmull.s32 q8, d18, d7
+ vmlal.s32 q8, d26, d1
+ vmlal.s32 q8, d19, d6
+ vmlal.s32 q8, d27, d0
+ add r2, sp, #544
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q7, d24, d21
+ vmlal.s32 q7, d25, d20
+ vmlal.s32 q4, d23, d21
+ vmlal.s32 q4, d29, d20
+ vmlal.s32 q8, d22, d21
+ vmlal.s32 q8, d28, d20
+ vmlal.s32 q5, d24, d20
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q7, d18, d6
+ vmlal.s32 q7, d26, d0
+ add r2, sp, #624
+ vld1.8 {d30-d31}, [r2, : 128]
+ vmlal.s32 q2, d30, d21
+ vmlal.s32 q7, d19, d21
+ vmlal.s32 q7, d27, d20
+ add r2, sp, #592
+ vld1.8 {d26-d27}, [r2, : 128]
+ vmlal.s32 q4, d25, d27
+ vmlal.s32 q8, d29, d27
+ vmlal.s32 q8, d25, d26
+ vmlal.s32 q7, d28, d27
+ vmlal.s32 q7, d29, d26
+ add r2, sp, #576
+ vld1.8 {d28-d29}, [r2, : 128]
+ vmlal.s32 q4, d24, d29
+ vmlal.s32 q8, d23, d29
+ vmlal.s32 q8, d24, d28
+ vmlal.s32 q7, d22, d29
+ vmlal.s32 q7, d23, d28
+ vst1.8 {d8-d9}, [r2, : 128]
+ add r2, sp, #528
+ vld1.8 {d8-d9}, [r2, : 128]
+ vmlal.s32 q7, d24, d9
+ vmlal.s32 q7, d25, d31
+ vmull.s32 q1, d18, d2
+ vmlal.s32 q1, d19, d1
+ vmlal.s32 q1, d22, d0
+ vmlal.s32 q1, d24, d27
+ vmlal.s32 q1, d23, d20
+ vmlal.s32 q1, d12, d7
+ vmlal.s32 q1, d13, d6
+ vmull.s32 q6, d18, d1
+ vmlal.s32 q6, d19, d0
+ vmlal.s32 q6, d23, d27
+ vmlal.s32 q6, d22, d20
+ vmlal.s32 q6, d24, d26
+ vmull.s32 q0, d18, d0
+ vmlal.s32 q0, d22, d27
+ vmlal.s32 q0, d23, d26
+ vmlal.s32 q0, d24, d31
+ vmlal.s32 q0, d19, d20
+ add r2, sp, #608
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q2, d18, d7
+ vmlal.s32 q5, d18, d6
+ vmlal.s32 q1, d18, d21
+ vmlal.s32 q0, d18, d28
+ vmlal.s32 q6, d18, d29
+ vmlal.s32 q2, d19, d6
+ vmlal.s32 q5, d19, d21
+ vmlal.s32 q1, d19, d29
+ vmlal.s32 q0, d19, d9
+ vmlal.s32 q6, d19, d28
+ add r2, sp, #560
+ vld1.8 {d18-d19}, [r2, : 128]
+ add r2, sp, #480
+ vld1.8 {d22-d23}, [r2, : 128]
+ vmlal.s32 q5, d19, d7
+ vmlal.s32 q0, d18, d21
+ vmlal.s32 q0, d19, d29
+ vmlal.s32 q6, d18, d6
+ add r2, sp, #496
+ vld1.8 {d6-d7}, [r2, : 128]
+ vmlal.s32 q6, d19, d21
+ add r2, sp, #544
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q0, d30, d8
+ add r2, sp, #640
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q5, d30, d29
+ add r2, sp, #576
+ vld1.8 {d24-d25}, [r2, : 128]
+ vmlal.s32 q1, d30, d28
+ vadd.i64 q13, q0, q11
+ vadd.i64 q14, q5, q11
+ vmlal.s32 q6, d30, d9
+ vshr.s64 q4, q13, #26
+ vshr.s64 q13, q14, #26
+ vadd.i64 q7, q7, q4
+ vshl.i64 q4, q4, #26
+ vadd.i64 q14, q7, q3
+ vadd.i64 q9, q9, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q15, q9, q3
+ vsub.i64 q0, q0, q4
+ vshr.s64 q4, q14, #25
+ vsub.i64 q5, q5, q13
+ vshr.s64 q13, q15, #25
+ vadd.i64 q6, q6, q4
+ vshl.i64 q4, q4, #25
+ vadd.i64 q14, q6, q11
+ vadd.i64 q2, q2, q13
+ vsub.i64 q4, q7, q4
+ vshr.s64 q7, q14, #26
+ vshl.i64 q13, q13, #25
+ vadd.i64 q14, q2, q11
+ vadd.i64 q8, q8, q7
+ vshl.i64 q7, q7, #26
+ vadd.i64 q15, q8, q3
+ vsub.i64 q9, q9, q13
+ vshr.s64 q13, q14, #26
+ vsub.i64 q6, q6, q7
+ vshr.s64 q7, q15, #25
+ vadd.i64 q10, q10, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q14, q10, q3
+ vadd.i64 q1, q1, q7
+ add r2, r3, #144
+ vshl.i64 q7, q7, #25
+ add r4, r3, #96
+ vadd.i64 q15, q1, q11
+ add r2, r2, #8
+ vsub.i64 q2, q2, q13
+ add r4, r4, #8
+ vshr.s64 q13, q14, #25
+ vsub.i64 q7, q8, q7
+ vshr.s64 q8, q15, #26
+ vadd.i64 q14, q13, q13
+ vadd.i64 q12, q12, q8
+ vtrn.32 d12, d14
+ vshl.i64 q8, q8, #26
+ vtrn.32 d13, d15
+ vadd.i64 q3, q12, q3
+ vadd.i64 q0, q0, q14
+ vst1.8 d12, [r2, : 64]!
+ vshl.i64 q7, q13, #4
+ vst1.8 d13, [r4, : 64]!
+ vsub.i64 q1, q1, q8
+ vshr.s64 q3, q3, #25
+ vadd.i64 q0, q0, q7
+ vadd.i64 q5, q5, q3
+ vshl.i64 q3, q3, #25
+ vadd.i64 q6, q5, q11
+ vadd.i64 q0, q0, q13
+ vshl.i64 q7, q13, #25
+ vadd.i64 q8, q0, q11
+ vsub.i64 q3, q12, q3
+ vshr.s64 q6, q6, #26
+ vsub.i64 q7, q10, q7
+ vtrn.32 d2, d6
+ vshr.s64 q8, q8, #26
+ vtrn.32 d3, d7
+ vadd.i64 q3, q9, q6
+ vst1.8 d2, [r2, : 64]
+ vshl.i64 q6, q6, #26
+ vst1.8 d3, [r4, : 64]
+ vadd.i64 q1, q4, q8
+ vtrn.32 d4, d14
+ vshl.i64 q4, q8, #26
+ vtrn.32 d5, d15
+ vsub.i64 q5, q5, q6
+ add r2, r2, #16
+ vsub.i64 q0, q0, q4
+ vst1.8 d4, [r2, : 64]
+ add r4, r4, #16
+ vst1.8 d5, [r4, : 64]
+ vtrn.32 d10, d6
+ vtrn.32 d11, d7
+ sub r2, r2, #8
+ sub r4, r4, #8
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vst1.8 d10, [r2, : 64]
+ vst1.8 d11, [r4, : 64]
+ sub r2, r2, #24
+ sub r4, r4, #24
+ vst1.8 d0, [r2, : 64]
+ vst1.8 d1, [r4, : 64]
+ add r2, r3, #288
+ add r4, r3, #336
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vsub.i32 q0, q0, q1
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d4-d5}, [r4, : 128]!
+ vsub.i32 q1, q1, q2
+ add r5, r3, #240
+ vld1.8 {d4}, [r2, : 64]
+ vld1.8 {d6}, [r4, : 64]
+ vsub.i32 q2, q2, q3
+ vst1.8 {d0-d1}, [r5, : 128]!
+ vst1.8 {d2-d3}, [r5, : 128]!
+ vst1.8 d4, [r5, : 64]
+ add r2, r3, #144
+ add r4, r3, #96
+ add r5, r3, #144
+ add r6, r3, #192
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vsub.i32 q2, q0, q1
+ vadd.i32 q0, q0, q1
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vsub.i32 q4, q1, q3
+ vadd.i32 q1, q1, q3
+ vld1.8 {d6}, [r2, : 64]
+ vld1.8 {d10}, [r4, : 64]
+ vsub.i32 q6, q3, q5
+ vadd.i32 q3, q3, q5
+ vst1.8 {d4-d5}, [r5, : 128]!
+ vst1.8 {d0-d1}, [r6, : 128]!
+ vst1.8 {d8-d9}, [r5, : 128]!
+ vst1.8 {d2-d3}, [r6, : 128]!
+ vst1.8 d12, [r5, : 64]
+ vst1.8 d6, [r6, : 64]
+ add r2, r3, #0
+ add r4, r3, #240
+ vld1.8 {d0-d1}, [r4, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vld1.8 {d4}, [r4, : 64]
+ add r4, r3, #336
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vtrn.32 q0, q3
+ vld1.8 {d8-d9}, [r4, : 128]!
+ vshl.i32 q5, q0, #4
+ vtrn.32 q1, q4
+ vshl.i32 q6, q3, #4
+ vadd.i32 q5, q5, q0
+ vadd.i32 q6, q6, q3
+ vshl.i32 q7, q1, #4
+ vld1.8 {d5}, [r4, : 64]
+ vshl.i32 q8, q4, #4
+ vtrn.32 d4, d5
+ vadd.i32 q7, q7, q1
+ vadd.i32 q8, q8, q4
+ vld1.8 {d18-d19}, [r2, : 128]!
+ vshl.i32 q10, q2, #4
+ vld1.8 {d22-d23}, [r2, : 128]!
+ vadd.i32 q10, q10, q2
+ vld1.8 {d24}, [r2, : 64]
+ vadd.i32 q5, q5, q0
+ add r2, r3, #288
+ vld1.8 {d26-d27}, [r2, : 128]!
+ vadd.i32 q6, q6, q3
+ vld1.8 {d28-d29}, [r2, : 128]!
+ vadd.i32 q8, q8, q4
+ vld1.8 {d25}, [r2, : 64]
+ vadd.i32 q10, q10, q2
+ vtrn.32 q9, q13
+ vadd.i32 q7, q7, q1
+ vadd.i32 q5, q5, q0
+ vtrn.32 q11, q14
+ vadd.i32 q6, q6, q3
+ add r2, sp, #528
+ vadd.i32 q10, q10, q2
+ vtrn.32 d24, d25
+ vst1.8 {d12-d13}, [r2, : 128]!
+ vshl.i32 q6, q13, #1
+ vst1.8 {d20-d21}, [r2, : 128]!
+ vshl.i32 q10, q14, #1
+ vst1.8 {d12-d13}, [r2, : 128]!
+ vshl.i32 q15, q12, #1
+ vadd.i32 q8, q8, q4
+ vext.32 d10, d31, d30, #0
+ vadd.i32 q7, q7, q1
+ vst1.8 {d16-d17}, [r2, : 128]!
+ vmull.s32 q8, d18, d5
+ vmlal.s32 q8, d26, d4
+ vmlal.s32 q8, d19, d9
+ vmlal.s32 q8, d27, d3
+ vmlal.s32 q8, d22, d8
+ vmlal.s32 q8, d28, d2
+ vmlal.s32 q8, d23, d7
+ vmlal.s32 q8, d29, d1
+ vmlal.s32 q8, d24, d6
+ vmlal.s32 q8, d25, d0
+ vst1.8 {d14-d15}, [r2, : 128]!
+ vmull.s32 q2, d18, d4
+ vmlal.s32 q2, d12, d9
+ vmlal.s32 q2, d13, d8
+ vmlal.s32 q2, d19, d3
+ vmlal.s32 q2, d22, d2
+ vmlal.s32 q2, d23, d1
+ vmlal.s32 q2, d24, d0
+ vst1.8 {d20-d21}, [r2, : 128]!
+ vmull.s32 q7, d18, d9
+ vmlal.s32 q7, d26, d3
+ vmlal.s32 q7, d19, d8
+ vmlal.s32 q7, d27, d2
+ vmlal.s32 q7, d22, d7
+ vmlal.s32 q7, d28, d1
+ vmlal.s32 q7, d23, d6
+ vmlal.s32 q7, d29, d0
+ vst1.8 {d10-d11}, [r2, : 128]!
+ vmull.s32 q5, d18, d3
+ vmlal.s32 q5, d19, d2
+ vmlal.s32 q5, d22, d1
+ vmlal.s32 q5, d23, d0
+ vmlal.s32 q5, d12, d8
+ vst1.8 {d16-d17}, [r2, : 128]!
+ vmull.s32 q4, d18, d8
+ vmlal.s32 q4, d26, d2
+ vmlal.s32 q4, d19, d7
+ vmlal.s32 q4, d27, d1
+ vmlal.s32 q4, d22, d6
+ vmlal.s32 q4, d28, d0
+ vmull.s32 q8, d18, d7
+ vmlal.s32 q8, d26, d1
+ vmlal.s32 q8, d19, d6
+ vmlal.s32 q8, d27, d0
+ add r2, sp, #544
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q7, d24, d21
+ vmlal.s32 q7, d25, d20
+ vmlal.s32 q4, d23, d21
+ vmlal.s32 q4, d29, d20
+ vmlal.s32 q8, d22, d21
+ vmlal.s32 q8, d28, d20
+ vmlal.s32 q5, d24, d20
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q7, d18, d6
+ vmlal.s32 q7, d26, d0
+ add r2, sp, #624
+ vld1.8 {d30-d31}, [r2, : 128]
+ vmlal.s32 q2, d30, d21
+ vmlal.s32 q7, d19, d21
+ vmlal.s32 q7, d27, d20
+ add r2, sp, #592
+ vld1.8 {d26-d27}, [r2, : 128]
+ vmlal.s32 q4, d25, d27
+ vmlal.s32 q8, d29, d27
+ vmlal.s32 q8, d25, d26
+ vmlal.s32 q7, d28, d27
+ vmlal.s32 q7, d29, d26
+ add r2, sp, #576
+ vld1.8 {d28-d29}, [r2, : 128]
+ vmlal.s32 q4, d24, d29
+ vmlal.s32 q8, d23, d29
+ vmlal.s32 q8, d24, d28
+ vmlal.s32 q7, d22, d29
+ vmlal.s32 q7, d23, d28
+ vst1.8 {d8-d9}, [r2, : 128]
+ add r2, sp, #528
+ vld1.8 {d8-d9}, [r2, : 128]
+ vmlal.s32 q7, d24, d9
+ vmlal.s32 q7, d25, d31
+ vmull.s32 q1, d18, d2
+ vmlal.s32 q1, d19, d1
+ vmlal.s32 q1, d22, d0
+ vmlal.s32 q1, d24, d27
+ vmlal.s32 q1, d23, d20
+ vmlal.s32 q1, d12, d7
+ vmlal.s32 q1, d13, d6
+ vmull.s32 q6, d18, d1
+ vmlal.s32 q6, d19, d0
+ vmlal.s32 q6, d23, d27
+ vmlal.s32 q6, d22, d20
+ vmlal.s32 q6, d24, d26
+ vmull.s32 q0, d18, d0
+ vmlal.s32 q0, d22, d27
+ vmlal.s32 q0, d23, d26
+ vmlal.s32 q0, d24, d31
+ vmlal.s32 q0, d19, d20
+ add r2, sp, #608
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q2, d18, d7
+ vmlal.s32 q5, d18, d6
+ vmlal.s32 q1, d18, d21
+ vmlal.s32 q0, d18, d28
+ vmlal.s32 q6, d18, d29
+ vmlal.s32 q2, d19, d6
+ vmlal.s32 q5, d19, d21
+ vmlal.s32 q1, d19, d29
+ vmlal.s32 q0, d19, d9
+ vmlal.s32 q6, d19, d28
+ add r2, sp, #560
+ vld1.8 {d18-d19}, [r2, : 128]
+ add r2, sp, #480
+ vld1.8 {d22-d23}, [r2, : 128]
+ vmlal.s32 q5, d19, d7
+ vmlal.s32 q0, d18, d21
+ vmlal.s32 q0, d19, d29
+ vmlal.s32 q6, d18, d6
+ add r2, sp, #496
+ vld1.8 {d6-d7}, [r2, : 128]
+ vmlal.s32 q6, d19, d21
+ add r2, sp, #544
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q0, d30, d8
+ add r2, sp, #640
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q5, d30, d29
+ add r2, sp, #576
+ vld1.8 {d24-d25}, [r2, : 128]
+ vmlal.s32 q1, d30, d28
+ vadd.i64 q13, q0, q11
+ vadd.i64 q14, q5, q11
+ vmlal.s32 q6, d30, d9
+ vshr.s64 q4, q13, #26
+ vshr.s64 q13, q14, #26
+ vadd.i64 q7, q7, q4
+ vshl.i64 q4, q4, #26
+ vadd.i64 q14, q7, q3
+ vadd.i64 q9, q9, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q15, q9, q3
+ vsub.i64 q0, q0, q4
+ vshr.s64 q4, q14, #25
+ vsub.i64 q5, q5, q13
+ vshr.s64 q13, q15, #25
+ vadd.i64 q6, q6, q4
+ vshl.i64 q4, q4, #25
+ vadd.i64 q14, q6, q11
+ vadd.i64 q2, q2, q13
+ vsub.i64 q4, q7, q4
+ vshr.s64 q7, q14, #26
+ vshl.i64 q13, q13, #25
+ vadd.i64 q14, q2, q11
+ vadd.i64 q8, q8, q7
+ vshl.i64 q7, q7, #26
+ vadd.i64 q15, q8, q3
+ vsub.i64 q9, q9, q13
+ vshr.s64 q13, q14, #26
+ vsub.i64 q6, q6, q7
+ vshr.s64 q7, q15, #25
+ vadd.i64 q10, q10, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q14, q10, q3
+ vadd.i64 q1, q1, q7
+ add r2, r3, #288
+ vshl.i64 q7, q7, #25
+ add r4, r3, #96
+ vadd.i64 q15, q1, q11
+ add r2, r2, #8
+ vsub.i64 q2, q2, q13
+ add r4, r4, #8
+ vshr.s64 q13, q14, #25
+ vsub.i64 q7, q8, q7
+ vshr.s64 q8, q15, #26
+ vadd.i64 q14, q13, q13
+ vadd.i64 q12, q12, q8
+ vtrn.32 d12, d14
+ vshl.i64 q8, q8, #26
+ vtrn.32 d13, d15
+ vadd.i64 q3, q12, q3
+ vadd.i64 q0, q0, q14
+ vst1.8 d12, [r2, : 64]!
+ vshl.i64 q7, q13, #4
+ vst1.8 d13, [r4, : 64]!
+ vsub.i64 q1, q1, q8
+ vshr.s64 q3, q3, #25
+ vadd.i64 q0, q0, q7
+ vadd.i64 q5, q5, q3
+ vshl.i64 q3, q3, #25
+ vadd.i64 q6, q5, q11
+ vadd.i64 q0, q0, q13
+ vshl.i64 q7, q13, #25
+ vadd.i64 q8, q0, q11
+ vsub.i64 q3, q12, q3
+ vshr.s64 q6, q6, #26
+ vsub.i64 q7, q10, q7
+ vtrn.32 d2, d6
+ vshr.s64 q8, q8, #26
+ vtrn.32 d3, d7
+ vadd.i64 q3, q9, q6
+ vst1.8 d2, [r2, : 64]
+ vshl.i64 q6, q6, #26
+ vst1.8 d3, [r4, : 64]
+ vadd.i64 q1, q4, q8
+ vtrn.32 d4, d14
+ vshl.i64 q4, q8, #26
+ vtrn.32 d5, d15
+ vsub.i64 q5, q5, q6
+ add r2, r2, #16
+ vsub.i64 q0, q0, q4
+ vst1.8 d4, [r2, : 64]
+ add r4, r4, #16
+ vst1.8 d5, [r4, : 64]
+ vtrn.32 d10, d6
+ vtrn.32 d11, d7
+ sub r2, r2, #8
+ sub r4, r4, #8
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vst1.8 d10, [r2, : 64]
+ vst1.8 d11, [r4, : 64]
+ sub r2, r2, #24
+ sub r4, r4, #24
+ vst1.8 d0, [r2, : 64]
+ vst1.8 d1, [r4, : 64]
+ add r2, sp, #512
+ add r4, r3, #144
+ add r5, r3, #192
+ vld1.8 {d0-d1}, [r2, : 128]
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vld1.8 {d4-d5}, [r5, : 128]!
+ vzip.i32 q1, q2
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vld1.8 {d8-d9}, [r5, : 128]!
+ vshl.i32 q5, q1, #1
+ vzip.i32 q3, q4
+ vshl.i32 q6, q2, #1
+ vld1.8 {d14}, [r4, : 64]
+ vshl.i32 q8, q3, #1
+ vld1.8 {d15}, [r5, : 64]
+ vshl.i32 q9, q4, #1
+ vmul.i32 d21, d7, d1
+ vtrn.32 d14, d15
+ vmul.i32 q11, q4, q0
+ vmul.i32 q0, q7, q0
+ vmull.s32 q12, d2, d2
+ vmlal.s32 q12, d11, d1
+ vmlal.s32 q12, d12, d0
+ vmlal.s32 q12, d13, d23
+ vmlal.s32 q12, d16, d22
+ vmlal.s32 q12, d7, d21
+ vmull.s32 q10, d2, d11
+ vmlal.s32 q10, d4, d1
+ vmlal.s32 q10, d13, d0
+ vmlal.s32 q10, d6, d23
+ vmlal.s32 q10, d17, d22
+ vmull.s32 q13, d10, d4
+ vmlal.s32 q13, d11, d3
+ vmlal.s32 q13, d13, d1
+ vmlal.s32 q13, d16, d0
+ vmlal.s32 q13, d17, d23
+ vmlal.s32 q13, d8, d22
+ vmull.s32 q1, d10, d5
+ vmlal.s32 q1, d11, d4
+ vmlal.s32 q1, d6, d1
+ vmlal.s32 q1, d17, d0
+ vmlal.s32 q1, d8, d23
+ vmull.s32 q14, d10, d6
+ vmlal.s32 q14, d11, d13
+ vmlal.s32 q14, d4, d4
+ vmlal.s32 q14, d17, d1
+ vmlal.s32 q14, d18, d0
+ vmlal.s32 q14, d9, d23
+ vmull.s32 q11, d10, d7
+ vmlal.s32 q11, d11, d6
+ vmlal.s32 q11, d12, d5
+ vmlal.s32 q11, d8, d1
+ vmlal.s32 q11, d19, d0
+ vmull.s32 q15, d10, d8
+ vmlal.s32 q15, d11, d17
+ vmlal.s32 q15, d12, d6
+ vmlal.s32 q15, d13, d5
+ vmlal.s32 q15, d19, d1
+ vmlal.s32 q15, d14, d0
+ vmull.s32 q2, d10, d9
+ vmlal.s32 q2, d11, d8
+ vmlal.s32 q2, d12, d7
+ vmlal.s32 q2, d13, d6
+ vmlal.s32 q2, d14, d1
+ vmull.s32 q0, d15, d1
+ vmlal.s32 q0, d10, d14
+ vmlal.s32 q0, d11, d19
+ vmlal.s32 q0, d12, d8
+ vmlal.s32 q0, d13, d17
+ vmlal.s32 q0, d6, d6
+ add r2, sp, #480
+ vld1.8 {d18-d19}, [r2, : 128]!
+ vmull.s32 q3, d16, d7
+ vmlal.s32 q3, d10, d15
+ vmlal.s32 q3, d11, d14
+ vmlal.s32 q3, d12, d9
+ vmlal.s32 q3, d13, d8
+ vld1.8 {d8-d9}, [r2, : 128]
+ vadd.i64 q5, q12, q9
+ vadd.i64 q6, q15, q9
+ vshr.s64 q5, q5, #26
+ vshr.s64 q6, q6, #26
+ vadd.i64 q7, q10, q5
+ vshl.i64 q5, q5, #26
+ vadd.i64 q8, q7, q4
+ vadd.i64 q2, q2, q6
+ vshl.i64 q6, q6, #26
+ vadd.i64 q10, q2, q4
+ vsub.i64 q5, q12, q5
+ vshr.s64 q8, q8, #25
+ vsub.i64 q6, q15, q6
+ vshr.s64 q10, q10, #25
+ vadd.i64 q12, q13, q8
+ vshl.i64 q8, q8, #25
+ vadd.i64 q13, q12, q9
+ vadd.i64 q0, q0, q10
+ vsub.i64 q7, q7, q8
+ vshr.s64 q8, q13, #26
+ vshl.i64 q10, q10, #25
+ vadd.i64 q13, q0, q9
+ vadd.i64 q1, q1, q8
+ vshl.i64 q8, q8, #26
+ vadd.i64 q15, q1, q4
+ vsub.i64 q2, q2, q10
+ vshr.s64 q10, q13, #26
+ vsub.i64 q8, q12, q8
+ vshr.s64 q12, q15, #25
+ vadd.i64 q3, q3, q10
+ vshl.i64 q10, q10, #26
+ vadd.i64 q13, q3, q4
+ vadd.i64 q14, q14, q12
+ add r2, r3, #144
+ vshl.i64 q12, q12, #25
+ add r4, r3, #192
+ vadd.i64 q15, q14, q9
+ add r2, r2, #8
+ vsub.i64 q0, q0, q10
+ add r4, r4, #8
+ vshr.s64 q10, q13, #25
+ vsub.i64 q1, q1, q12
+ vshr.s64 q12, q15, #26
+ vadd.i64 q13, q10, q10
+ vadd.i64 q11, q11, q12
+ vtrn.32 d16, d2
+ vshl.i64 q12, q12, #26
+ vtrn.32 d17, d3
+ vadd.i64 q1, q11, q4
+ vadd.i64 q4, q5, q13
+ vst1.8 d16, [r2, : 64]!
+ vshl.i64 q5, q10, #4
+ vst1.8 d17, [r4, : 64]!
+ vsub.i64 q8, q14, q12
+ vshr.s64 q1, q1, #25
+ vadd.i64 q4, q4, q5
+ vadd.i64 q5, q6, q1
+ vshl.i64 q1, q1, #25
+ vadd.i64 q6, q5, q9
+ vadd.i64 q4, q4, q10
+ vshl.i64 q10, q10, #25
+ vadd.i64 q9, q4, q9
+ vsub.i64 q1, q11, q1
+ vshr.s64 q6, q6, #26
+ vsub.i64 q3, q3, q10
+ vtrn.32 d16, d2
+ vshr.s64 q9, q9, #26
+ vtrn.32 d17, d3
+ vadd.i64 q1, q2, q6
+ vst1.8 d16, [r2, : 64]
+ vshl.i64 q2, q6, #26
+ vst1.8 d17, [r4, : 64]
+ vadd.i64 q6, q7, q9
+ vtrn.32 d0, d6
+ vshl.i64 q7, q9, #26
+ vtrn.32 d1, d7
+ vsub.i64 q2, q5, q2
+ add r2, r2, #16
+ vsub.i64 q3, q4, q7
+ vst1.8 d0, [r2, : 64]
+ add r4, r4, #16
+ vst1.8 d1, [r4, : 64]
+ vtrn.32 d4, d2
+ vtrn.32 d5, d3
+ sub r2, r2, #8
+ sub r4, r4, #8
+ vtrn.32 d6, d12
+ vtrn.32 d7, d13
+ vst1.8 d4, [r2, : 64]
+ vst1.8 d5, [r4, : 64]
+ sub r2, r2, #24
+ sub r4, r4, #24
+ vst1.8 d6, [r2, : 64]
+ vst1.8 d7, [r4, : 64]
+ add r2, r3, #336
+ add r4, r3, #288
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vadd.i32 q0, q0, q1
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d4-d5}, [r4, : 128]!
+ vadd.i32 q1, q1, q2
+ add r5, r3, #288
+ vld1.8 {d4}, [r2, : 64]
+ vld1.8 {d6}, [r4, : 64]
+ vadd.i32 q2, q2, q3
+ vst1.8 {d0-d1}, [r5, : 128]!
+ vst1.8 {d2-d3}, [r5, : 128]!
+ vst1.8 d4, [r5, : 64]
+ add r2, r3, #48
+ add r4, r3, #144
+ vld1.8 {d0-d1}, [r4, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vld1.8 {d4}, [r4, : 64]
+ add r4, r3, #288
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vtrn.32 q0, q3
+ vld1.8 {d8-d9}, [r4, : 128]!
+ vshl.i32 q5, q0, #4
+ vtrn.32 q1, q4
+ vshl.i32 q6, q3, #4
+ vadd.i32 q5, q5, q0
+ vadd.i32 q6, q6, q3
+ vshl.i32 q7, q1, #4
+ vld1.8 {d5}, [r4, : 64]
+ vshl.i32 q8, q4, #4
+ vtrn.32 d4, d5
+ vadd.i32 q7, q7, q1
+ vadd.i32 q8, q8, q4
+ vld1.8 {d18-d19}, [r2, : 128]!
+ vshl.i32 q10, q2, #4
+ vld1.8 {d22-d23}, [r2, : 128]!
+ vadd.i32 q10, q10, q2
+ vld1.8 {d24}, [r2, : 64]
+ vadd.i32 q5, q5, q0
+ add r2, r3, #240
+ vld1.8 {d26-d27}, [r2, : 128]!
+ vadd.i32 q6, q6, q3
+ vld1.8 {d28-d29}, [r2, : 128]!
+ vadd.i32 q8, q8, q4
+ vld1.8 {d25}, [r2, : 64]
+ vadd.i32 q10, q10, q2
+ vtrn.32 q9, q13
+ vadd.i32 q7, q7, q1
+ vadd.i32 q5, q5, q0
+ vtrn.32 q11, q14
+ vadd.i32 q6, q6, q3
+ add r2, sp, #528
+ vadd.i32 q10, q10, q2
+ vtrn.32 d24, d25
+ vst1.8 {d12-d13}, [r2, : 128]!
+ vshl.i32 q6, q13, #1
+ vst1.8 {d20-d21}, [r2, : 128]!
+ vshl.i32 q10, q14, #1
+ vst1.8 {d12-d13}, [r2, : 128]!
+ vshl.i32 q15, q12, #1
+ vadd.i32 q8, q8, q4
+ vext.32 d10, d31, d30, #0
+ vadd.i32 q7, q7, q1
+ vst1.8 {d16-d17}, [r2, : 128]!
+ vmull.s32 q8, d18, d5
+ vmlal.s32 q8, d26, d4
+ vmlal.s32 q8, d19, d9
+ vmlal.s32 q8, d27, d3
+ vmlal.s32 q8, d22, d8
+ vmlal.s32 q8, d28, d2
+ vmlal.s32 q8, d23, d7
+ vmlal.s32 q8, d29, d1
+ vmlal.s32 q8, d24, d6
+ vmlal.s32 q8, d25, d0
+ vst1.8 {d14-d15}, [r2, : 128]!
+ vmull.s32 q2, d18, d4
+ vmlal.s32 q2, d12, d9
+ vmlal.s32 q2, d13, d8
+ vmlal.s32 q2, d19, d3
+ vmlal.s32 q2, d22, d2
+ vmlal.s32 q2, d23, d1
+ vmlal.s32 q2, d24, d0
+ vst1.8 {d20-d21}, [r2, : 128]!
+ vmull.s32 q7, d18, d9
+ vmlal.s32 q7, d26, d3
+ vmlal.s32 q7, d19, d8
+ vmlal.s32 q7, d27, d2
+ vmlal.s32 q7, d22, d7
+ vmlal.s32 q7, d28, d1
+ vmlal.s32 q7, d23, d6
+ vmlal.s32 q7, d29, d0
+ vst1.8 {d10-d11}, [r2, : 128]!
+ vmull.s32 q5, d18, d3
+ vmlal.s32 q5, d19, d2
+ vmlal.s32 q5, d22, d1
+ vmlal.s32 q5, d23, d0
+ vmlal.s32 q5, d12, d8
+ vst1.8 {d16-d17}, [r2, : 128]!
+ vmull.s32 q4, d18, d8
+ vmlal.s32 q4, d26, d2
+ vmlal.s32 q4, d19, d7
+ vmlal.s32 q4, d27, d1
+ vmlal.s32 q4, d22, d6
+ vmlal.s32 q4, d28, d0
+ vmull.s32 q8, d18, d7
+ vmlal.s32 q8, d26, d1
+ vmlal.s32 q8, d19, d6
+ vmlal.s32 q8, d27, d0
+ add r2, sp, #544
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q7, d24, d21
+ vmlal.s32 q7, d25, d20
+ vmlal.s32 q4, d23, d21
+ vmlal.s32 q4, d29, d20
+ vmlal.s32 q8, d22, d21
+ vmlal.s32 q8, d28, d20
+ vmlal.s32 q5, d24, d20
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q7, d18, d6
+ vmlal.s32 q7, d26, d0
+ add r2, sp, #624
+ vld1.8 {d30-d31}, [r2, : 128]
+ vmlal.s32 q2, d30, d21
+ vmlal.s32 q7, d19, d21
+ vmlal.s32 q7, d27, d20
+ add r2, sp, #592
+ vld1.8 {d26-d27}, [r2, : 128]
+ vmlal.s32 q4, d25, d27
+ vmlal.s32 q8, d29, d27
+ vmlal.s32 q8, d25, d26
+ vmlal.s32 q7, d28, d27
+ vmlal.s32 q7, d29, d26
+ add r2, sp, #576
+ vld1.8 {d28-d29}, [r2, : 128]
+ vmlal.s32 q4, d24, d29
+ vmlal.s32 q8, d23, d29
+ vmlal.s32 q8, d24, d28
+ vmlal.s32 q7, d22, d29
+ vmlal.s32 q7, d23, d28
+ vst1.8 {d8-d9}, [r2, : 128]
+ add r2, sp, #528
+ vld1.8 {d8-d9}, [r2, : 128]
+ vmlal.s32 q7, d24, d9
+ vmlal.s32 q7, d25, d31
+ vmull.s32 q1, d18, d2
+ vmlal.s32 q1, d19, d1
+ vmlal.s32 q1, d22, d0
+ vmlal.s32 q1, d24, d27
+ vmlal.s32 q1, d23, d20
+ vmlal.s32 q1, d12, d7
+ vmlal.s32 q1, d13, d6
+ vmull.s32 q6, d18, d1
+ vmlal.s32 q6, d19, d0
+ vmlal.s32 q6, d23, d27
+ vmlal.s32 q6, d22, d20
+ vmlal.s32 q6, d24, d26
+ vmull.s32 q0, d18, d0
+ vmlal.s32 q0, d22, d27
+ vmlal.s32 q0, d23, d26
+ vmlal.s32 q0, d24, d31
+ vmlal.s32 q0, d19, d20
+ add r2, sp, #608
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q2, d18, d7
+ vmlal.s32 q5, d18, d6
+ vmlal.s32 q1, d18, d21
+ vmlal.s32 q0, d18, d28
+ vmlal.s32 q6, d18, d29
+ vmlal.s32 q2, d19, d6
+ vmlal.s32 q5, d19, d21
+ vmlal.s32 q1, d19, d29
+ vmlal.s32 q0, d19, d9
+ vmlal.s32 q6, d19, d28
+ add r2, sp, #560
+ vld1.8 {d18-d19}, [r2, : 128]
+ add r2, sp, #480
+ vld1.8 {d22-d23}, [r2, : 128]
+ vmlal.s32 q5, d19, d7
+ vmlal.s32 q0, d18, d21
+ vmlal.s32 q0, d19, d29
+ vmlal.s32 q6, d18, d6
+ add r2, sp, #496
+ vld1.8 {d6-d7}, [r2, : 128]
+ vmlal.s32 q6, d19, d21
+ add r2, sp, #544
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q0, d30, d8
+ add r2, sp, #640
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q5, d30, d29
+ add r2, sp, #576
+ vld1.8 {d24-d25}, [r2, : 128]
+ vmlal.s32 q1, d30, d28
+ vadd.i64 q13, q0, q11
+ vadd.i64 q14, q5, q11
+ vmlal.s32 q6, d30, d9
+ vshr.s64 q4, q13, #26
+ vshr.s64 q13, q14, #26
+ vadd.i64 q7, q7, q4
+ vshl.i64 q4, q4, #26
+ vadd.i64 q14, q7, q3
+ vadd.i64 q9, q9, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q15, q9, q3
+ vsub.i64 q0, q0, q4
+ vshr.s64 q4, q14, #25
+ vsub.i64 q5, q5, q13
+ vshr.s64 q13, q15, #25
+ vadd.i64 q6, q6, q4
+ vshl.i64 q4, q4, #25
+ vadd.i64 q14, q6, q11
+ vadd.i64 q2, q2, q13
+ vsub.i64 q4, q7, q4
+ vshr.s64 q7, q14, #26
+ vshl.i64 q13, q13, #25
+ vadd.i64 q14, q2, q11
+ vadd.i64 q8, q8, q7
+ vshl.i64 q7, q7, #26
+ vadd.i64 q15, q8, q3
+ vsub.i64 q9, q9, q13
+ vshr.s64 q13, q14, #26
+ vsub.i64 q6, q6, q7
+ vshr.s64 q7, q15, #25
+ vadd.i64 q10, q10, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q14, q10, q3
+ vadd.i64 q1, q1, q7
+ add r2, r3, #240
+ vshl.i64 q7, q7, #25
+ add r4, r3, #144
+ vadd.i64 q15, q1, q11
+ add r2, r2, #8
+ vsub.i64 q2, q2, q13
+ add r4, r4, #8
+ vshr.s64 q13, q14, #25
+ vsub.i64 q7, q8, q7
+ vshr.s64 q8, q15, #26
+ vadd.i64 q14, q13, q13
+ vadd.i64 q12, q12, q8
+ vtrn.32 d12, d14
+ vshl.i64 q8, q8, #26
+ vtrn.32 d13, d15
+ vadd.i64 q3, q12, q3
+ vadd.i64 q0, q0, q14
+ vst1.8 d12, [r2, : 64]!
+ vshl.i64 q7, q13, #4
+ vst1.8 d13, [r4, : 64]!
+ vsub.i64 q1, q1, q8
+ vshr.s64 q3, q3, #25
+ vadd.i64 q0, q0, q7
+ vadd.i64 q5, q5, q3
+ vshl.i64 q3, q3, #25
+ vadd.i64 q6, q5, q11
+ vadd.i64 q0, q0, q13
+ vshl.i64 q7, q13, #25
+ vadd.i64 q8, q0, q11
+ vsub.i64 q3, q12, q3
+ vshr.s64 q6, q6, #26
+ vsub.i64 q7, q10, q7
+ vtrn.32 d2, d6
+ vshr.s64 q8, q8, #26
+ vtrn.32 d3, d7
+ vadd.i64 q3, q9, q6
+ vst1.8 d2, [r2, : 64]
+ vshl.i64 q6, q6, #26
+ vst1.8 d3, [r4, : 64]
+ vadd.i64 q1, q4, q8
+ vtrn.32 d4, d14
+ vshl.i64 q4, q8, #26
+ vtrn.32 d5, d15
+ vsub.i64 q5, q5, q6
+ add r2, r2, #16
+ vsub.i64 q0, q0, q4
+ vst1.8 d4, [r2, : 64]
+ add r4, r4, #16
+ vst1.8 d5, [r4, : 64]
+ vtrn.32 d10, d6
+ vtrn.32 d11, d7
+ sub r2, r2, #8
+ sub r4, r4, #8
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vst1.8 d10, [r2, : 64]
+ vst1.8 d11, [r4, : 64]
+ sub r2, r2, #24
+ sub r4, r4, #24
+ vst1.8 d0, [r2, : 64]
+ vst1.8 d1, [r4, : 64]
+ ldr r2, [sp, #456]
+ ldr r4, [sp, #460]
+ subs r5, r2, #1
+ bge .Lmainloop
+ add r1, r3, #144
+ add r2, r3, #336
+ vld1.8 {d0-d1}, [r1, : 128]!
+ vld1.8 {d2-d3}, [r1, : 128]!
+ vld1.8 {d4}, [r1, : 64]
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 {d2-d3}, [r2, : 128]!
+ vst1.8 d4, [r2, : 64]
+ movw r1, #0
+.Linvertloop:
+ add r2, r3, #144
+ movw r4, #0
+ movw r5, #2
+ cmp r1, #1
+ moveq r5, #1
+ addeq r2, r3, #336
+ addeq r4, r3, #48
+ cmp r1, #2
+ moveq r5, #1
+ addeq r2, r3, #48
+ cmp r1, #3
+ moveq r5, #5
+ addeq r4, r3, #336
+ cmp r1, #4
+ moveq r5, #10
+ cmp r1, #5
+ moveq r5, #20
+ cmp r1, #6
+ moveq r5, #10
+ addeq r2, r3, #336
+ addeq r4, r3, #336
+ cmp r1, #7
+ moveq r5, #50
+ cmp r1, #8
+ moveq r5, #100
+ cmp r1, #9
+ moveq r5, #50
+ addeq r2, r3, #336
+ cmp r1, #10
+ moveq r5, #5
+ addeq r2, r3, #48
+ cmp r1, #11
+ moveq r5, #0
+ addeq r2, r3, #96
+ add r6, r3, #144
+ add r7, r3, #288
+ vld1.8 {d0-d1}, [r6, : 128]!
+ vld1.8 {d2-d3}, [r6, : 128]!
+ vld1.8 {d4}, [r6, : 64]
+ vst1.8 {d0-d1}, [r7, : 128]!
+ vst1.8 {d2-d3}, [r7, : 128]!
+ vst1.8 d4, [r7, : 64]
+ cmp r5, #0
+ beq .Lskipsquaringloop
+.Lsquaringloop:
+ add r6, r3, #288
+ add r7, r3, #288
+ add r8, r3, #288
+ vmov.i32 q0, #19
+ vmov.i32 q1, #0
+ vmov.i32 q2, #1
+ vzip.i32 q1, q2
+ vld1.8 {d4-d5}, [r7, : 128]!
+ vld1.8 {d6-d7}, [r7, : 128]!
+ vld1.8 {d9}, [r7, : 64]
+ vld1.8 {d10-d11}, [r6, : 128]!
+ add r7, sp, #384
+ vld1.8 {d12-d13}, [r6, : 128]!
+ vmul.i32 q7, q2, q0
+ vld1.8 {d8}, [r6, : 64]
+ vext.32 d17, d11, d10, #1
+ vmul.i32 q9, q3, q0
+ vext.32 d16, d10, d8, #1
+ vshl.u32 q10, q5, q1
+ vext.32 d22, d14, d4, #1
+ vext.32 d24, d18, d6, #1
+ vshl.u32 q13, q6, q1
+ vshl.u32 d28, d8, d2
+ vrev64.i32 d22, d22
+ vmul.i32 d1, d9, d1
+ vrev64.i32 d24, d24
+ vext.32 d29, d8, d13, #1
+ vext.32 d0, d1, d9, #1
+ vrev64.i32 d0, d0
+ vext.32 d2, d9, d1, #1
+ vext.32 d23, d15, d5, #1
+ vmull.s32 q4, d20, d4
+ vrev64.i32 d23, d23
+ vmlal.s32 q4, d21, d1
+ vrev64.i32 d2, d2
+ vmlal.s32 q4, d26, d19
+ vext.32 d3, d5, d15, #1
+ vmlal.s32 q4, d27, d18
+ vrev64.i32 d3, d3
+ vmlal.s32 q4, d28, d15
+ vext.32 d14, d12, d11, #1
+ vmull.s32 q5, d16, d23
+ vext.32 d15, d13, d12, #1
+ vmlal.s32 q5, d17, d4
+ vst1.8 d8, [r7, : 64]!
+ vmlal.s32 q5, d14, d1
+ vext.32 d12, d9, d8, #0
+ vmlal.s32 q5, d15, d19
+ vmov.i64 d13, #0
+ vmlal.s32 q5, d29, d18
+ vext.32 d25, d19, d7, #1
+ vmlal.s32 q6, d20, d5
+ vrev64.i32 d25, d25
+ vmlal.s32 q6, d21, d4
+ vst1.8 d11, [r7, : 64]!
+ vmlal.s32 q6, d26, d1
+ vext.32 d9, d10, d10, #0
+ vmlal.s32 q6, d27, d19
+ vmov.i64 d8, #0
+ vmlal.s32 q6, d28, d18
+ vmlal.s32 q4, d16, d24
+ vmlal.s32 q4, d17, d5
+ vmlal.s32 q4, d14, d4
+ vst1.8 d12, [r7, : 64]!
+ vmlal.s32 q4, d15, d1
+ vext.32 d10, d13, d12, #0
+ vmlal.s32 q4, d29, d19
+ vmov.i64 d11, #0
+ vmlal.s32 q5, d20, d6
+ vmlal.s32 q5, d21, d5
+ vmlal.s32 q5, d26, d4
+ vext.32 d13, d8, d8, #0
+ vmlal.s32 q5, d27, d1
+ vmov.i64 d12, #0
+ vmlal.s32 q5, d28, d19
+ vst1.8 d9, [r7, : 64]!
+ vmlal.s32 q6, d16, d25
+ vmlal.s32 q6, d17, d6
+ vst1.8 d10, [r7, : 64]
+ vmlal.s32 q6, d14, d5
+ vext.32 d8, d11, d10, #0
+ vmlal.s32 q6, d15, d4
+ vmov.i64 d9, #0
+ vmlal.s32 q6, d29, d1
+ vmlal.s32 q4, d20, d7
+ vmlal.s32 q4, d21, d6
+ vmlal.s32 q4, d26, d5
+ vext.32 d11, d12, d12, #0
+ vmlal.s32 q4, d27, d4
+ vmov.i64 d10, #0
+ vmlal.s32 q4, d28, d1
+ vmlal.s32 q5, d16, d0
+ sub r6, r7, #32
+ vmlal.s32 q5, d17, d7
+ vmlal.s32 q5, d14, d6
+ vext.32 d30, d9, d8, #0
+ vmlal.s32 q5, d15, d5
+ vld1.8 {d31}, [r6, : 64]!
+ vmlal.s32 q5, d29, d4
+ vmlal.s32 q15, d20, d0
+ vext.32 d0, d6, d18, #1
+ vmlal.s32 q15, d21, d25
+ vrev64.i32 d0, d0
+ vmlal.s32 q15, d26, d24
+ vext.32 d1, d7, d19, #1
+ vext.32 d7, d10, d10, #0
+ vmlal.s32 q15, d27, d23
+ vrev64.i32 d1, d1
+ vld1.8 {d6}, [r6, : 64]
+ vmlal.s32 q15, d28, d22
+ vmlal.s32 q3, d16, d4
+ add r6, r6, #24
+ vmlal.s32 q3, d17, d2
+ vext.32 d4, d31, d30, #0
+ vmov d17, d11
+ vmlal.s32 q3, d14, d1
+ vext.32 d11, d13, d13, #0
+ vext.32 d13, d30, d30, #0
+ vmlal.s32 q3, d15, d0
+ vext.32 d1, d8, d8, #0
+ vmlal.s32 q3, d29, d3
+ vld1.8 {d5}, [r6, : 64]
+ sub r6, r6, #16
+ vext.32 d10, d6, d6, #0
+ vmov.i32 q1, #0xffffffff
+ vshl.i64 q4, q1, #25
+ add r7, sp, #480
+ vld1.8 {d14-d15}, [r7, : 128]
+ vadd.i64 q9, q2, q7
+ vshl.i64 q1, q1, #26
+ vshr.s64 q10, q9, #26
+ vld1.8 {d0}, [r6, : 64]!
+ vadd.i64 q5, q5, q10
+ vand q9, q9, q1
+ vld1.8 {d16}, [r6, : 64]!
+ add r6, sp, #496
+ vld1.8 {d20-d21}, [r6, : 128]
+ vadd.i64 q11, q5, q10
+ vsub.i64 q2, q2, q9
+ vshr.s64 q9, q11, #25
+ vext.32 d12, d5, d4, #0
+ vand q11, q11, q4
+ vadd.i64 q0, q0, q9
+ vmov d19, d7
+ vadd.i64 q3, q0, q7
+ vsub.i64 q5, q5, q11
+ vshr.s64 q11, q3, #26
+ vext.32 d18, d11, d10, #0
+ vand q3, q3, q1
+ vadd.i64 q8, q8, q11
+ vadd.i64 q11, q8, q10
+ vsub.i64 q0, q0, q3
+ vshr.s64 q3, q11, #25
+ vand q11, q11, q4
+ vadd.i64 q3, q6, q3
+ vadd.i64 q6, q3, q7
+ vsub.i64 q8, q8, q11
+ vshr.s64 q11, q6, #26
+ vand q6, q6, q1
+ vadd.i64 q9, q9, q11
+ vadd.i64 d25, d19, d21
+ vsub.i64 q3, q3, q6
+ vshr.s64 d23, d25, #25
+ vand q4, q12, q4
+ vadd.i64 d21, d23, d23
+ vshl.i64 d25, d23, #4
+ vadd.i64 d21, d21, d23
+ vadd.i64 d25, d25, d21
+ vadd.i64 d4, d4, d25
+ vzip.i32 q0, q8
+ vadd.i64 d12, d4, d14
+ add r6, r8, #8
+ vst1.8 d0, [r6, : 64]
+ vsub.i64 d19, d19, d9
+ add r6, r6, #16
+ vst1.8 d16, [r6, : 64]
+ vshr.s64 d22, d12, #26
+ vand q0, q6, q1
+ vadd.i64 d10, d10, d22
+ vzip.i32 q3, q9
+ vsub.i64 d4, d4, d0
+ sub r6, r6, #8
+ vst1.8 d6, [r6, : 64]
+ add r6, r6, #16
+ vst1.8 d18, [r6, : 64]
+ vzip.i32 q2, q5
+ sub r6, r6, #32
+ vst1.8 d4, [r6, : 64]
+ subs r5, r5, #1
+ bhi .Lsquaringloop
+.Lskipsquaringloop:
+ mov r2, r2 @ no-op, retained from the original SUPERCOP code
+ add r5, r3, #288
+ add r6, r3, #144
+ vmov.i32 q0, #19
+ vmov.i32 q1, #0
+ vmov.i32 q2, #1
+ vzip.i32 q1, q2
+ vld1.8 {d4-d5}, [r5, : 128]!
+ vld1.8 {d6-d7}, [r5, : 128]!
+ vld1.8 {d9}, [r5, : 64]
+ vld1.8 {d10-d11}, [r2, : 128]!
+ add r5, sp, #384
+ vld1.8 {d12-d13}, [r2, : 128]!
+ vmul.i32 q7, q2, q0
+ vld1.8 {d8}, [r2, : 64]
+ vext.32 d17, d11, d10, #1
+ vmul.i32 q9, q3, q0
+ vext.32 d16, d10, d8, #1
+ vshl.u32 q10, q5, q1
+ vext.32 d22, d14, d4, #1
+ vext.32 d24, d18, d6, #1
+ vshl.u32 q13, q6, q1
+ vshl.u32 d28, d8, d2
+ vrev64.i32 d22, d22
+ vmul.i32 d1, d9, d1
+ vrev64.i32 d24, d24
+ vext.32 d29, d8, d13, #1
+ vext.32 d0, d1, d9, #1
+ vrev64.i32 d0, d0
+ vext.32 d2, d9, d1, #1
+ vext.32 d23, d15, d5, #1
+ vmull.s32 q4, d20, d4
+ vrev64.i32 d23, d23
+ vmlal.s32 q4, d21, d1
+ vrev64.i32 d2, d2
+ vmlal.s32 q4, d26, d19
+ vext.32 d3, d5, d15, #1
+ vmlal.s32 q4, d27, d18
+ vrev64.i32 d3, d3
+ vmlal.s32 q4, d28, d15
+ vext.32 d14, d12, d11, #1
+ vmull.s32 q5, d16, d23
+ vext.32 d15, d13, d12, #1
+ vmlal.s32 q5, d17, d4
+ vst1.8 d8, [r5, : 64]!
+ vmlal.s32 q5, d14, d1
+ vext.32 d12, d9, d8, #0
+ vmlal.s32 q5, d15, d19
+ vmov.i64 d13, #0
+ vmlal.s32 q5, d29, d18
+ vext.32 d25, d19, d7, #1
+ vmlal.s32 q6, d20, d5
+ vrev64.i32 d25, d25
+ vmlal.s32 q6, d21, d4
+ vst1.8 d11, [r5, : 64]!
+ vmlal.s32 q6, d26, d1
+ vext.32 d9, d10, d10, #0
+ vmlal.s32 q6, d27, d19
+ vmov.i64 d8, #0
+ vmlal.s32 q6, d28, d18
+ vmlal.s32 q4, d16, d24
+ vmlal.s32 q4, d17, d5
+ vmlal.s32 q4, d14, d4
+ vst1.8 d12, [r5, : 64]!
+ vmlal.s32 q4, d15, d1
+ vext.32 d10, d13, d12, #0
+ vmlal.s32 q4, d29, d19
+ vmov.i64 d11, #0
+ vmlal.s32 q5, d20, d6
+ vmlal.s32 q5, d21, d5
+ vmlal.s32 q5, d26, d4
+ vext.32 d13, d8, d8, #0
+ vmlal.s32 q5, d27, d1
+ vmov.i64 d12, #0
+ vmlal.s32 q5, d28, d19
+ vst1.8 d9, [r5, : 64]!
+ vmlal.s32 q6, d16, d25
+ vmlal.s32 q6, d17, d6
+ vst1.8 d10, [r5, : 64]
+ vmlal.s32 q6, d14, d5
+ vext.32 d8, d11, d10, #0
+ vmlal.s32 q6, d15, d4
+ vmov.i64 d9, #0
+ vmlal.s32 q6, d29, d1
+ vmlal.s32 q4, d20, d7
+ vmlal.s32 q4, d21, d6
+ vmlal.s32 q4, d26, d5
+ vext.32 d11, d12, d12, #0
+ vmlal.s32 q4, d27, d4
+ vmov.i64 d10, #0
+ vmlal.s32 q4, d28, d1
+ vmlal.s32 q5, d16, d0
+ sub r2, r5, #32
+ vmlal.s32 q5, d17, d7
+ vmlal.s32 q5, d14, d6
+ vext.32 d30, d9, d8, #0
+ vmlal.s32 q5, d15, d5
+ vld1.8 {d31}, [r2, : 64]!
+ vmlal.s32 q5, d29, d4
+ vmlal.s32 q15, d20, d0
+ vext.32 d0, d6, d18, #1
+ vmlal.s32 q15, d21, d25
+ vrev64.i32 d0, d0
+ vmlal.s32 q15, d26, d24
+ vext.32 d1, d7, d19, #1
+ vext.32 d7, d10, d10, #0
+ vmlal.s32 q15, d27, d23
+ vrev64.i32 d1, d1
+ vld1.8 {d6}, [r2, : 64]
+ vmlal.s32 q15, d28, d22
+ vmlal.s32 q3, d16, d4
+ add r2, r2, #24
+ vmlal.s32 q3, d17, d2
+ vext.32 d4, d31, d30, #0
+ vmov d17, d11
+ vmlal.s32 q3, d14, d1
+ vext.32 d11, d13, d13, #0
+ vext.32 d13, d30, d30, #0
+ vmlal.s32 q3, d15, d0
+ vext.32 d1, d8, d8, #0
+ vmlal.s32 q3, d29, d3
+ vld1.8 {d5}, [r2, : 64]
+ sub r2, r2, #16
+ vext.32 d10, d6, d6, #0
+ vmov.i32 q1, #0xffffffff
+ vshl.i64 q4, q1, #25
+ add r5, sp, #480
+ vld1.8 {d14-d15}, [r5, : 128]
+ vadd.i64 q9, q2, q7
+ vshl.i64 q1, q1, #26
+ vshr.s64 q10, q9, #26
+ vld1.8 {d0}, [r2, : 64]!
+ vadd.i64 q5, q5, q10
+ vand q9, q9, q1
+ vld1.8 {d16}, [r2, : 64]!
+ add r2, sp, #496
+ vld1.8 {d20-d21}, [r2, : 128]
+ vadd.i64 q11, q5, q10
+ vsub.i64 q2, q2, q9
+ vshr.s64 q9, q11, #25
+ vext.32 d12, d5, d4, #0
+ vand q11, q11, q4
+ vadd.i64 q0, q0, q9
+ vmov d19, d7
+ vadd.i64 q3, q0, q7
+ vsub.i64 q5, q5, q11
+ vshr.s64 q11, q3, #26
+ vext.32 d18, d11, d10, #0
+ vand q3, q3, q1
+ vadd.i64 q8, q8, q11
+ vadd.i64 q11, q8, q10
+ vsub.i64 q0, q0, q3
+ vshr.s64 q3, q11, #25
+ vand q11, q11, q4
+ vadd.i64 q3, q6, q3
+ vadd.i64 q6, q3, q7
+ vsub.i64 q8, q8, q11
+ vshr.s64 q11, q6, #26
+ vand q6, q6, q1
+ vadd.i64 q9, q9, q11
+ vadd.i64 d25, d19, d21
+ vsub.i64 q3, q3, q6
+ vshr.s64 d23, d25, #25
+ vand q4, q12, q4
+ vadd.i64 d21, d23, d23
+ vshl.i64 d25, d23, #4
+ vadd.i64 d21, d21, d23
+ vadd.i64 d25, d25, d21
+ vadd.i64 d4, d4, d25
+ vzip.i32 q0, q8
+ vadd.i64 d12, d4, d14
+ add r2, r6, #8
+ vst1.8 d0, [r2, : 64]
+ vsub.i64 d19, d19, d9
+ add r2, r2, #16
+ vst1.8 d16, [r2, : 64]
+ vshr.s64 d22, d12, #26
+ vand q0, q6, q1
+ vadd.i64 d10, d10, d22
+ vzip.i32 q3, q9
+ vsub.i64 d4, d4, d0
+ sub r2, r2, #8
+ vst1.8 d6, [r2, : 64]
+ add r2, r2, #16
+ vst1.8 d18, [r2, : 64]
+ vzip.i32 q2, q5
+ sub r2, r2, #32
+ vst1.8 d4, [r2, : 64]
+ cmp r4, #0
+ beq .Lskippostcopy
+ add r2, r3, #144
+ mov r4, r4 @ no-op, retained from the original SUPERCOP code
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d4}, [r2, : 64]
+ vst1.8 {d0-d1}, [r4, : 128]!
+ vst1.8 {d2-d3}, [r4, : 128]!
+ vst1.8 d4, [r4, : 64]
+.Lskippostcopy:
+ cmp r1, #1
+ bne .Lskipfinalcopy
+ add r2, r3, #288
+ add r4, r3, #144
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d4}, [r2, : 64]
+ vst1.8 {d0-d1}, [r4, : 128]!
+ vst1.8 {d2-d3}, [r4, : 128]!
+ vst1.8 d4, [r4, : 64]
+.Lskipfinalcopy:
+ add r1, r1, #1
+ cmp r1, #12
+ blo .Linvertloop
+ add r1, r3, #144
+ ldr r2, [r1], #4
+ ldr r3, [r1], #4
+ ldr r4, [r1], #4
+ ldr r5, [r1], #4
+ ldr r6, [r1], #4
+ ldr r7, [r1], #4
+ ldr r8, [r1], #4
+ ldr r9, [r1], #4
+ ldr r10, [r1], #4
+ ldr r1, [r1]
+ add r11, r1, r1, LSL #4
+ add r11, r11, r1, LSL #1
+ add r11, r11, #16777216
+ mov r11, r11, ASR #25
+ add r11, r11, r2
+ mov r11, r11, ASR #26
+ add r11, r11, r3
+ mov r11, r11, ASR #25
+ add r11, r11, r4
+ mov r11, r11, ASR #26
+ add r11, r11, r5
+ mov r11, r11, ASR #25
+ add r11, r11, r6
+ mov r11, r11, ASR #26
+ add r11, r11, r7
+ mov r11, r11, ASR #25
+ add r11, r11, r8
+ mov r11, r11, ASR #26
+ add r11, r11, r9
+ mov r11, r11, ASR #25
+ add r11, r11, r10
+ mov r11, r11, ASR #26
+ add r11, r11, r1
+ mov r11, r11, ASR #25
+ add r2, r2, r11
+ add r2, r2, r11, LSL #1
+ add r2, r2, r11, LSL #4
+ mov r11, r2, ASR #26
+ add r3, r3, r11
+ sub r2, r2, r11, LSL #26
+ mov r11, r3, ASR #25
+ add r4, r4, r11
+ sub r3, r3, r11, LSL #25
+ mov r11, r4, ASR #26
+ add r5, r5, r11
+ sub r4, r4, r11, LSL #26
+ mov r11, r5, ASR #25
+ add r6, r6, r11
+ sub r5, r5, r11, LSL #25
+ mov r11, r6, ASR #26
+ add r7, r7, r11
+ sub r6, r6, r11, LSL #26
+ mov r11, r7, ASR #25
+ add r8, r8, r11
+ sub r7, r7, r11, LSL #25
+ mov r11, r8, ASR #26
+ add r9, r9, r11
+ sub r8, r8, r11, LSL #26
+ mov r11, r9, ASR #25
+ add r10, r10, r11
+ sub r9, r9, r11, LSL #25
+ mov r11, r10, ASR #26
+ add r1, r1, r11
+ sub r10, r10, r11, LSL #26
+ mov r11, r1, ASR #25
+ sub r1, r1, r11, LSL #25
+ add r2, r2, r3, LSL #26
+ mov r3, r3, LSR #6
+ add r3, r3, r4, LSL #19
+ mov r4, r4, LSR #13
+ add r4, r4, r5, LSL #13
+ mov r5, r5, LSR #19
+ add r5, r5, r6, LSL #6
+ add r6, r7, r8, LSL #25
+ mov r7, r8, LSR #7
+ add r7, r7, r9, LSL #19
+ mov r8, r9, LSR #13
+ add r8, r8, r10, LSL #12
+ mov r9, r10, LSR #20
+ add r1, r9, r1, LSL #6
+ str r2, [r0]
+ str r3, [r0, #4]
+ str r4, [r0, #8]
+ str r5, [r0, #12]
+ str r6, [r0, #16]
+ str r7, [r0, #20]
+ str r8, [r0, #24]
+ str r1, [r0, #28]
+ movw r0, #0
+ mov sp, ip
+ pop {r4-r11, pc}
+ENDPROC(curve25519_neon)
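
[Editor's note] The scalar tail above (the ldr/ASR/LSL sequence that runs once
the .Linvertloop counter reaches 12) performs the final "freeze" of the field
element. The ten limbs h_0..h_9 use the base-2^25.5 radix of the original
SUPERCOP neon2 code, and the reduction relies on the prime p = 2^255 - 19.
As a sketch of the arithmetic being performed:

    h = \sum_{i=0}^{9} h_i \, 2^{\lceil 25.5 i \rceil}, \qquad
    2^{255} \equiv 19 \pmod{p}

The ASR chain propagates a trial carry to decide whether h needs one more
subtraction of p (call the resulting bit q); the three adds on r2 then fold
19*q into h_0, the carry/mask chain renormalizes each limb while discarding
the 2^255 carry, and the final shifts pack the result into the eight 32-bit
words stored at [r0].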
diff --git a/lib/crypto/arm/curve25519.h b/lib/crypto/arm/curve25519.h
new file mode 100644
index 000000000000..b1a566885e95
--- /dev/null
+++ b/lib/crypto/arm/curve25519.h
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
+ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
+ * manually reworked for use in kernel space.
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/simd.h>
+#include <linux/types.h>
+#include <linux/jump_label.h>
+
+asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE]);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+static void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
+ const u8 scalar[CURVE25519_KEY_SIZE],
+ const u8 point[CURVE25519_KEY_SIZE])
+{
+ if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
+ scoped_ksimd()
+ curve25519_neon(out, scalar, point);
+ } else {
+ curve25519_generic(out, scalar, point);
+ }
+}
+
+static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE])
+{
+ curve25519_arch(pub, secret, curve25519_base_point);
+}
+
+#define curve25519_mod_init_arch curve25519_mod_init_arch
+static void curve25519_mod_init_arch(void)
+{
+ if (elf_hwcap & HWCAP_NEON)
+ static_branch_enable(&have_neon);
+}
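
[Editor's note] A minimal, hypothetical caller sketch for the dispatch above;
it is not part of the patch. It assumes CURVE25519_KEY_SIZE and
curve25519_clamp_secret() from <crypto/curve25519.h> as in mainline kernels,
and imagines living in the same translation unit as the static helpers
defined in this header:

    #include <crypto/curve25519.h>
    #include <linux/random.h>

    static void demo_curve25519_keypair(u8 pub[CURVE25519_KEY_SIZE],
                                        u8 secret[CURVE25519_KEY_SIZE])
    {
            /* Random secret, clamped as RFC 7748 requires. */
            get_random_bytes(secret, CURVE25519_KEY_SIZE);
            curve25519_clamp_secret(secret);

            /*
             * Scalar-multiply the generator: takes the NEON path only
             * when the static key was enabled at init time and SIMD is
             * usable in the current context.
             */
            curve25519_base_arch(pub, secret);
    }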
diff --git a/lib/crypto/arm/poly1305-armv4.pl b/lib/crypto/arm/poly1305-armv4.pl
new file mode 100644
index 000000000000..34c11b7b44bd
--- /dev/null
+++ b/lib/crypto/arm/poly1305-armv4.pl
@@ -0,0 +1,1235 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
+# project.
+# ====================================================================
+#
+# Numbers are cycles per processed byte.
+#
+# IALU(*)/gcc-4.4 NEON
+#
+# ARM11xx(ARMv6) 7.78/+100% -
+# Cortex-A5 6.35/+130% 3.00
+# Cortex-A8 6.25/+115% 2.36
+# Cortex-A9 5.10/+95% 2.55
+# Cortex-A15 3.85/+85% 1.25(**)
+# Snapdragon S4 5.70/+100% 1.48(**)
+#
+# (*) this is for -march=armv6, i.e. with a bunch of ldrb instructions
+#     loading the data;
+# (**) these are trade-off results; they can be improved by ~8%, but at
+#     the cost of a 15/12% regression on Cortex-A5/A7. It's even
+#     possible to improve the Cortex-A9 result, but then A5/A7 lose
+#     more than 20%;
+
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
+# define poly1305_init poly1305_block_init
+# define poly1305_blocks poly1305_blocks_arm
+#endif
+
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+.text
+
+.globl poly1305_emit
+.globl poly1305_blocks
+.globl poly1305_init
+.type poly1305_init,%function
+.align 5
+poly1305_init:
+.Lpoly1305_init:
+ stmdb sp!,{r4-r11}
+
+ eor r3,r3,r3
+ cmp $inp,#0
+ str r3,[$ctx,#0] @ zero hash value
+ str r3,[$ctx,#4]
+ str r3,[$ctx,#8]
+ str r3,[$ctx,#12]
+ str r3,[$ctx,#16]
+ str r3,[$ctx,#36] @ clear is_base2_26
+ add $ctx,$ctx,#20
+
+#ifdef __thumb2__
+ it eq
+#endif
+ moveq r0,#0
+ beq .Lno_key
+
+#if __ARM_MAX_ARCH__>=7
+ mov r3,#-1
+ str r3,[$ctx,#28] @ impossible key power value
+# ifndef __KERNEL__
+ adr r11,.Lpoly1305_init
+ ldr r12,.LOPENSSL_armcap
+# endif
+#endif
+ ldrb r4,[$inp,#0]
+ mov r10,#0x0fffffff
+ ldrb r5,[$inp,#1]
+ and r3,r10,#-4 @ 0x0ffffffc
+ ldrb r6,[$inp,#2]
+ ldrb r7,[$inp,#3]
+ orr r4,r4,r5,lsl#8
+ ldrb r5,[$inp,#4]
+ orr r4,r4,r6,lsl#16
+ ldrb r6,[$inp,#5]
+ orr r4,r4,r7,lsl#24
+ ldrb r7,[$inp,#6]
+ and r4,r4,r10
+
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+# if !defined(_WIN32)
+ ldr r12,[r11,r12] @ OPENSSL_armcap_P
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
+ ldr r12,[r12]
+# endif
+#endif
+ ldrb r8,[$inp,#7]
+ orr r5,r5,r6,lsl#8
+ ldrb r6,[$inp,#8]
+ orr r5,r5,r7,lsl#16
+ ldrb r7,[$inp,#9]
+ orr r5,r5,r8,lsl#24
+ ldrb r8,[$inp,#10]
+ and r5,r5,r3
+
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ tst r12,#ARMV7_NEON @ check for NEON
+# ifdef __thumb2__
+ adr r9,.Lpoly1305_blocks_neon
+ adr r11,.Lpoly1305_blocks
+ it ne
+ movne r11,r9
+ adr r12,.Lpoly1305_emit
+ orr r11,r11,#1 @ thumb-ify addresses
+ orr r12,r12,#1
+# else
+ add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
+ ite eq
+ addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
+ addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
+# endif
+#endif
+ ldrb r9,[$inp,#11]
+ orr r6,r6,r7,lsl#8
+ ldrb r7,[$inp,#12]
+ orr r6,r6,r8,lsl#16
+ ldrb r8,[$inp,#13]
+ orr r6,r6,r9,lsl#24
+ ldrb r9,[$inp,#14]
+ and r6,r6,r3
+
+ ldrb r10,[$inp,#15]
+ orr r7,r7,r8,lsl#8
+ str r4,[$ctx,#0]
+ orr r7,r7,r9,lsl#16
+ str r5,[$ctx,#4]
+ orr r7,r7,r10,lsl#24
+ str r6,[$ctx,#8]
+ and r7,r7,r3
+ str r7,[$ctx,#12]
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ stmia r2,{r11,r12} @ fill functions table
+ mov r0,#1
+#else
+ mov r0,#0
+#endif
+.Lno_key:
+ ldmia sp!,{r4-r11}
+#if __ARM_ARCH__>=5
+ ret @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_init,.-poly1305_init
+___
+{
+my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
+my ($s1,$s2,$s3)=($r1,$r2,$r3);
+
+$code.=<<___;
+.type poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+.Lpoly1305_blocks:
+ stmdb sp!,{r3-r11,lr}
+
+ ands $len,$len,#-16
+ beq .Lno_data
+
+ add $len,$len,$inp @ end pointer
+ sub sp,sp,#32
+
+#if __ARM_ARCH__<7
+ ldmia $ctx,{$h0-$r3} @ load context
+ add $ctx,$ctx,#20
+ str $len,[sp,#16] @ offload stuff
+ str $ctx,[sp,#12]
+#else
+ ldr lr,[$ctx,#36] @ is_base2_26
+ ldmia $ctx!,{$h0-$h4} @ load hash value
+ str $len,[sp,#16] @ offload stuff
+ str $ctx,[sp,#12]
+
+ adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
+ mov $r1,$h1,lsr#6
+ adcs $r1,$r1,$h2,lsl#20
+ mov $r2,$h2,lsr#12
+ adcs $r2,$r2,$h3,lsl#14
+ mov $r3,$h3,lsr#18
+ adcs $r3,$r3,$h4,lsl#8
+ mov $len,#0
+ teq lr,#0
+ str $len,[$ctx,#16] @ clear is_base2_26
+ adc $len,$len,$h4,lsr#24
+
+ itttt ne
+ movne $h0,$r0 @ choose between radixes
+ movne $h1,$r1
+ movne $h2,$r2
+ movne $h3,$r3
+ ldmia $ctx,{$r0-$r3} @ load key
+ it ne
+ movne $h4,$len
+#endif
+
+ mov lr,$inp
+ cmp $padbit,#0
+ str $r1,[sp,#20]
+ str $r2,[sp,#24]
+ str $r3,[sp,#28]
+ b .Loop
+
+.align 4
+.Loop:
+#if __ARM_ARCH__<7
+ ldrb r0,[lr],#16 @ load input
+# ifdef __thumb2__
+ it hi
+# endif
+ addhi $h4,$h4,#1 @ 1<<128
+ ldrb r1,[lr,#-15]
+ ldrb r2,[lr,#-14]
+ ldrb r3,[lr,#-13]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-12]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-11]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-10]
+ adds $h0,$h0,r3 @ accumulate input
+
+ ldrb r3,[lr,#-9]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-8]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-7]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-6]
+ adcs $h1,$h1,r3
+
+ ldrb r3,[lr,#-5]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-4]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-3]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-2]
+ adcs $h2,$h2,r3
+
+ ldrb r3,[lr,#-1]
+ orr r1,r0,r1,lsl#8
+ str lr,[sp,#8] @ offload input pointer
+ orr r2,r1,r2,lsl#16
+ add $s1,$r1,$r1,lsr#2
+ orr r3,r2,r3,lsl#24
+#else
+ ldr r0,[lr],#16 @ load input
+ it hi
+ addhi $h4,$h4,#1 @ padbit
+ ldr r1,[lr,#-12]
+ ldr r2,[lr,#-8]
+ ldr r3,[lr,#-4]
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+ adds $h0,$h0,r0 @ accumulate input
+ str lr,[sp,#8] @ offload input pointer
+ adcs $h1,$h1,r1
+ add $s1,$r1,$r1,lsr#2
+ adcs $h2,$h2,r2
+#endif
+ add $s2,$r2,$r2,lsr#2
+ adcs $h3,$h3,r3
+ add $s3,$r3,$r3,lsr#2
+
+ umull r2,r3,$h1,$r0
+ adc $h4,$h4,#0
+ umull r0,r1,$h0,$r0
+ umlal r2,r3,$h4,$s1
+ umlal r0,r1,$h3,$s1
+ ldr $r1,[sp,#20] @ reload $r1
+ umlal r2,r3,$h2,$s3
+ umlal r0,r1,$h1,$s3
+ umlal r2,r3,$h3,$s2
+ umlal r0,r1,$h2,$s2
+ umlal r2,r3,$h0,$r1
+ str r0,[sp,#0] @ future $h0
+ mul r0,$s2,$h4
+ ldr $r2,[sp,#24] @ reload $r2
+ adds r2,r2,r1 @ d1+=d0>>32
+ eor r1,r1,r1
+ adc lr,r3,#0 @ future $h2
+ str r2,[sp,#4] @ future $h1
+
+ mul r2,$s3,$h4
+ eor r3,r3,r3
+ umlal r0,r1,$h3,$s3
+ ldr $r3,[sp,#28] @ reload $r3
+ umlal r2,r3,$h3,$r0
+ umlal r0,r1,$h2,$r0
+ umlal r2,r3,$h2,$r1
+ umlal r0,r1,$h1,$r1
+ umlal r2,r3,$h1,$r2
+ umlal r0,r1,$h0,$r2
+ umlal r2,r3,$h0,$r3
+ ldr $h0,[sp,#0]
+ mul $h4,$r0,$h4
+ ldr $h1,[sp,#4]
+
+ adds $h2,lr,r0 @ d2+=d1>>32
+ ldr lr,[sp,#8] @ reload input pointer
+ adc r1,r1,#0
+ adds $h3,r2,r1 @ d3+=d2>>32
+ ldr r0,[sp,#16] @ reload end pointer
+ adc r3,r3,#0
+ add $h4,$h4,r3 @ h4+=d3>>32
+
+ and r1,$h4,#-4
+ and $h4,$h4,#3
+ add r1,r1,r1,lsr#2 @ *=5
+ adds $h0,$h0,r1
+ adcs $h1,$h1,#0
+ adcs $h2,$h2,#0
+ adcs $h3,$h3,#0
+ adc $h4,$h4,#0
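+ @ (Editor's note: r1 above holds 5*(h4>>2), i.e. the bits of h4 at
+ @ weight 2^130 and up, which are folded back into h0..h4 using
+ @ 2^130 == 5 (mod 2^130-5).)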
+
+ cmp r0,lr @ done yet?
+ bhi .Loop
+
+ ldr $ctx,[sp,#12]
+ add sp,sp,#32
+ stmdb $ctx,{$h0-$h4} @ store the result
+
+.Lno_data:
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r3-r11,pc}
+#else
+ ldmia sp!,{r3-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_blocks,.-poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce)=map("r$_",(0..2));
+my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
+my $g4=$ctx;
+
+$code.=<<___;
+.type poly1305_emit,%function
+.align 5
+poly1305_emit:
+.Lpoly1305_emit:
+ stmdb sp!,{r4-r11}
+
+ ldmia $ctx,{$h0-$h4}
+
+#if __ARM_ARCH__>=7
+ ldr ip,[$ctx,#36] @ is_base2_26
+
+ adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
+ mov $g1,$h1,lsr#6
+ adcs $g1,$g1,$h2,lsl#20
+ mov $g2,$h2,lsr#12
+ adcs $g2,$g2,$h3,lsl#14
+ mov $g3,$h3,lsr#18
+ adcs $g3,$g3,$h4,lsl#8
+ mov $g4,#0
+ adc $g4,$g4,$h4,lsr#24
+
+ tst ip,ip
+ itttt ne
+ movne $h0,$g0
+ movne $h1,$g1
+ movne $h2,$g2
+ movne $h3,$g3
+ it ne
+ movne $h4,$g4
+#endif
+
+ adds $g0,$h0,#5 @ compare to modulus
+ adcs $g1,$h1,#0
+ adcs $g2,$h2,#0
+ adcs $g3,$h3,#0
+ adc $g4,$h4,#0
+ tst $g4,#4 @ did it carry/borrow?
+
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h0,$g0
+ ldr $g0,[$nonce,#0]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h1,$g1
+ ldr $g1,[$nonce,#4]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h2,$g2
+ ldr $g2,[$nonce,#8]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h3,$g3
+ ldr $g3,[$nonce,#12]
+
+ adds $h0,$h0,$g0
+ adcs $h1,$h1,$g1
+ adcs $h2,$h2,$g2
+ adc $h3,$h3,$g3
+
+#if __ARM_ARCH__>=7
+# ifdef __ARMEB__
+ rev $h0,$h0
+ rev $h1,$h1
+ rev $h2,$h2
+ rev $h3,$h3
+# endif
+ str $h0,[$mac,#0]
+ str $h1,[$mac,#4]
+ str $h2,[$mac,#8]
+ str $h3,[$mac,#12]
+#else
+ strb $h0,[$mac,#0]
+ mov $h0,$h0,lsr#8
+ strb $h1,[$mac,#4]
+ mov $h1,$h1,lsr#8
+ strb $h2,[$mac,#8]
+ mov $h2,$h2,lsr#8
+ strb $h3,[$mac,#12]
+ mov $h3,$h3,lsr#8
+
+ strb $h0,[$mac,#1]
+ mov $h0,$h0,lsr#8
+ strb $h1,[$mac,#5]
+ mov $h1,$h1,lsr#8
+ strb $h2,[$mac,#9]
+ mov $h2,$h2,lsr#8
+ strb $h3,[$mac,#13]
+ mov $h3,$h3,lsr#8
+
+ strb $h0,[$mac,#2]
+ mov $h0,$h0,lsr#8
+ strb $h1,[$mac,#6]
+ mov $h1,$h1,lsr#8
+ strb $h2,[$mac,#10]
+ mov $h2,$h2,lsr#8
+ strb $h3,[$mac,#14]
+ mov $h3,$h3,lsr#8
+
+ strb $h0,[$mac,#3]
+ strb $h1,[$mac,#7]
+ strb $h2,[$mac,#11]
+ strb $h3,[$mac,#15]
+#endif
+ ldmia sp!,{r4-r11}
+#if __ARM_ARCH__>=5
+ ret @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_emit,.-poly1305_emit
+___
+{
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
+my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
+my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
+
+my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.fpu neon
+
+.type poly1305_init_neon,%function
+.align 5
+poly1305_init_neon:
+.Lpoly1305_init_neon:
+ ldr r3,[$ctx,#48] @ first table element
+ cmp r3,#-1 @ is value impossible?
+ bne .Lno_init_neon
+
+ ldr r4,[$ctx,#20] @ load key base 2^32
+ ldr r5,[$ctx,#24]
+ ldr r6,[$ctx,#28]
+ ldr r7,[$ctx,#32]
+
+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
+ mov r3,r4,lsr#26
+ mov r4,r5,lsr#20
+ orr r3,r3,r5,lsl#6
+ mov r5,r6,lsr#14
+ orr r4,r4,r6,lsl#12
+ mov r6,r7,lsr#8
+ orr r5,r5,r7,lsl#18
+ and r3,r3,#0x03ffffff
+ and r4,r4,#0x03ffffff
+ and r5,r5,#0x03ffffff
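+ @ (Editor's note: the shifts/ors above split the four 32-bit key
+ @ words into five 26-bit limbs, r_i = (r >> 26*i) & 0x3ffffff, so
+ @ that the 26x26->52-bit lane products fit comfortably in 64-bit
+ @ accumulators.)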
+
+ vdup.32 $R0,r2 @ r^1 in both lanes
+ add r2,r3,r3,lsl#2 @ *5
+ vdup.32 $R1,r3
+ add r3,r4,r4,lsl#2
+ vdup.32 $S1,r2
+ vdup.32 $R2,r4
+ add r4,r5,r5,lsl#2
+ vdup.32 $S2,r3
+ vdup.32 $R3,r5
+ add r5,r6,r6,lsl#2
+ vdup.32 $S3,r4
+ vdup.32 $R4,r6
+ vdup.32 $S4,r5
+
+ mov $zeros,#2 @ counter
+
+.Lsquare_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
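+ @
+ @ (Editor's note: the 5*r_i terms come from Poly1305's modulus
+ @ 2^130-5. Any product term with weight 2^130, e.g. h4*r1 in d0,
+ @ folds back in multiplied by 5, since 2^130 == 5 (mod 2^130-5);
+ @ hence the precomputed s_i = 5*r_i values.)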
+
+ vmull.u32 $D0,$R0,${R0}[1]
+ vmull.u32 $D1,$R1,${R0}[1]
+ vmull.u32 $D2,$R2,${R0}[1]
+ vmull.u32 $D3,$R3,${R0}[1]
+ vmull.u32 $D4,$R4,${R0}[1]
+
+ vmlal.u32 $D0,$R4,${S1}[1]
+ vmlal.u32 $D1,$R0,${R1}[1]
+ vmlal.u32 $D2,$R1,${R1}[1]
+ vmlal.u32 $D3,$R2,${R1}[1]
+ vmlal.u32 $D4,$R3,${R1}[1]
+
+ vmlal.u32 $D0,$R3,${S2}[1]
+ vmlal.u32 $D1,$R4,${S2}[1]
+ vmlal.u32 $D3,$R1,${R2}[1]
+ vmlal.u32 $D2,$R0,${R2}[1]
+ vmlal.u32 $D4,$R2,${R2}[1]
+
+ vmlal.u32 $D0,$R2,${S3}[1]
+ vmlal.u32 $D3,$R0,${R3}[1]
+ vmlal.u32 $D1,$R3,${S3}[1]
+ vmlal.u32 $D2,$R4,${S3}[1]
+ vmlal.u32 $D4,$R1,${R3}[1]
+
+ vmlal.u32 $D3,$R4,${S4}[1]
+ vmlal.u32 $D0,$R1,${S4}[1]
+ vmlal.u32 $D1,$R2,${S4}[1]
+ vmlal.u32 $D2,$R3,${S4}[1]
+ vmlal.u32 $D4,$R0,${R4}[1]
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ @ and P. Schwabe
+ @
+ @ H0>>+H1>>+H2>>+H3>>+H4
+ @ H3>>+H4>>*5+H0>>+H1
+ @
+ @ Trivia.
+ @
+ @ The result of multiplying an n-bit number by an m-bit number is
+ @ n+m bits wide. However! Even though 2^n is an (n+1)-bit number,
+ @ an m-bit number multiplied by 2^n is still only n+m bits wide.
+ @
+ @ The sum of two n-bit numbers is n+1 bits wide, the sum of three
+ @ or four is n+2, and the sum of 2^m (n-m)-bit numbers plus one
+ @ n-bit number is n+1 bits wide.
+ @
+ @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
+ @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
+ @ can be 27 bits. However, in cases when their width exceeds 26
+ @ bits they are limited by 2^26+2^6. This in turn means that the
+ @ *sum* of the products with these values can still be viewed as
+ @ a sum of 52-bit numbers as long as the number of addends is not
+ @ a power of 2. For example,
+ @
+ @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
+ @
+ @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
+ @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
+ @ 8 * (2^52), or 2^55. However, the value is then multiplied by 5,
+ @ so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), which
+ @ is less than 32 * (2^52), or 2^57. And when processing data we
+ @ are looking at three times as many addends...
+ @
+ @ In the key setup procedure the pre-reduced H0 is limited by
+ @ 5*4+1 52-bit addends, and 5*H4 by 5*5, or 57 bits. But when
+ @ hashing the input, H0 is limited by (5*4+1)*3 addends, or 58
+ @ bits, while 5*H4 by 5*5*3, or 59[!] bits. How is this relevant?
+ @ The vmlal.u32 instruction accepts 2x32-bit input and writes a
+ @ 2x64-bit result. This means that the result of the reduction has
+ @ to be compressed upon loop wrap-around. This can be done in the
+ @ process of reduction to minimize the number of instructions [as
+ @ well as the number of 128-bit instructions, which benefits
+ @ low-end processors], but one has to watch out for H2 (which is
+ @ narrower than H0) and 5*H4 not being wider than 58 bits, so that
+ @ the result of the right shift by 26 bits fits in 32 bits. This
+ @ is also useful on x86, because it allows using paddd in place of
+ @ paddq, which benefits Atom, where paddq is ridiculously slow.
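+ @
+ @ (Editor's note, a worked instance of the h4 -> h0 wrap below: if
+ @ h4 = 0x5ffffff, the carry is c = h4>>26 = 1, h4 &= 0x3ffffff,
+ @ and h0 += 5*c, implemented as h0 += c followed by h0 += c<<2.)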
+
+ vshr.u64 $T0,$D3,#26
+ vmovn.i64 $D3#lo,$D3
+ vshr.u64 $T1,$D0,#26
+ vmovn.i64 $D0#lo,$D0
+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
+ vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
+ vbic.i32 $D0#lo,#0xfc000000
+
+ vshrn.u64 $T0#lo,$D4,#26
+ vmovn.i64 $D4#lo,$D4
+ vshr.u64 $T1,$D1,#26
+ vmovn.i64 $D1#lo,$D1
+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
+ vbic.i32 $D4#lo,#0xfc000000
+ vbic.i32 $D1#lo,#0xfc000000
+
+ vadd.i32 $D0#lo,$D0#lo,$T0#lo
+ vshl.u32 $T0#lo,$T0#lo,#2
+ vshrn.u64 $T1#lo,$D2,#26
+ vmovn.i64 $D2#lo,$D2
+ vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
+ vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
+ vbic.i32 $D2#lo,#0xfc000000
+
+ vshr.u32 $T0#lo,$D0#lo,#26
+ vbic.i32 $D0#lo,#0xfc000000
+ vshr.u32 $T1#lo,$D3#lo,#26
+ vbic.i32 $D3#lo,#0xfc000000
+ vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
+ vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
+
+ subs $zeros,$zeros,#1
+ beq .Lsquare_break_neon
+
+ add $tbl0,$ctx,#(48+0*9*4)
+ add $tbl1,$ctx,#(48+1*9*4)
+
+ vtrn.32 $R0,$D0#lo @ r^2:r^1
+ vtrn.32 $R2,$D2#lo
+ vtrn.32 $R3,$D3#lo
+ vtrn.32 $R1,$D1#lo
+ vtrn.32 $R4,$D4#lo
+
+ vshl.u32 $S2,$R2,#2 @ *5
+ vshl.u32 $S3,$R3,#2
+ vshl.u32 $S1,$R1,#2
+ vshl.u32 $S4,$R4,#2
+ vadd.i32 $S2,$S2,$R2
+ vadd.i32 $S1,$S1,$R1
+ vadd.i32 $S3,$S3,$R3
+ vadd.i32 $S4,$S4,$R4
+
+ vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
+ vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
+ vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vst1.32 {${S4}[0]},[$tbl0,:32]
+ vst1.32 {${S4}[1]},[$tbl1,:32]
+
+ b .Lsquare_neon
+
+.align 4
+.Lsquare_break_neon:
+ add $tbl0,$ctx,#(48+2*4*9)
+ add $tbl1,$ctx,#(48+3*4*9)
+
+ vmov $R0,$D0#lo @ r^4:r^3
+ vshl.u32 $S1,$D1#lo,#2 @ *5
+ vmov $R1,$D1#lo
+ vshl.u32 $S2,$D2#lo,#2
+ vmov $R2,$D2#lo
+ vshl.u32 $S3,$D3#lo,#2
+ vmov $R3,$D3#lo
+ vshl.u32 $S4,$D4#lo,#2
+ vmov $R4,$D4#lo
+ vadd.i32 $S1,$S1,$D1#lo
+ vadd.i32 $S2,$S2,$D2#lo
+ vadd.i32 $S3,$S3,$D3#lo
+ vadd.i32 $S4,$S4,$D4#lo
+
+ vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
+ vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
+ vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vst1.32 {${S4}[0]},[$tbl0]
+ vst1.32 {${S4}[1]},[$tbl1]
+
+.Lno_init_neon:
+ ret @ bx lr
+.size poly1305_init_neon,.-poly1305_init_neon
+
+.globl poly1305_blocks_neon
+.type poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
+ ldr ip,[$ctx,#36] @ is_base2_26
+
+ cmp $len,#64
+ blo .Lpoly1305_blocks
+
+ stmdb sp!,{r4-r7}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ tst ip,ip @ is_base2_26?
+ bne .Lbase2_26_neon
+
+ stmdb sp!,{r1-r3,lr}
+ bl .Lpoly1305_init_neon
+
+ ldr r4,[$ctx,#0] @ load hash value base 2^32
+ ldr r5,[$ctx,#4]
+ ldr r6,[$ctx,#8]
+ ldr r7,[$ctx,#12]
+ ldr ip,[$ctx,#16]
+
+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
+ mov r3,r4,lsr#26
+ veor $D0#lo,$D0#lo,$D0#lo
+ mov r4,r5,lsr#20
+ orr r3,r3,r5,lsl#6
+ veor $D1#lo,$D1#lo,$D1#lo
+ mov r5,r6,lsr#14
+ orr r4,r4,r6,lsl#12
+ veor $D2#lo,$D2#lo,$D2#lo
+ mov r6,r7,lsr#8
+ orr r5,r5,r7,lsl#18
+ veor $D3#lo,$D3#lo,$D3#lo
+ and r3,r3,#0x03ffffff
+ orr r6,r6,ip,lsl#24
+ veor $D4#lo,$D4#lo,$D4#lo
+ and r4,r4,#0x03ffffff
+ mov r1,#1
+ and r5,r5,#0x03ffffff
+ str r1,[$ctx,#36] @ set is_base2_26
+
+ vmov.32 $D0#lo[0],r2
+ vmov.32 $D1#lo[0],r3
+ vmov.32 $D2#lo[0],r4
+ vmov.32 $D3#lo[0],r5
+ vmov.32 $D4#lo[0],r6
+ adr $zeros,.Lzeros
+
+ ldmia sp!,{r1-r3,lr}
+ b .Lhash_loaded
+
+.align 4
+.Lbase2_26_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ load hash value
+
+ veor $D0#lo,$D0#lo,$D0#lo
+ veor $D1#lo,$D1#lo,$D1#lo
+ veor $D2#lo,$D2#lo,$D2#lo
+ veor $D3#lo,$D3#lo,$D3#lo
+ veor $D4#lo,$D4#lo,$D4#lo
+ vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
+ adr $zeros,.Lzeros
+ vld1.32 {$D4#lo[0]},[$ctx]
+ sub $ctx,$ctx,#16 @ rewind
+
+.Lhash_loaded:
+ add $in2,$inp,#32
+ mov $padbit,$padbit,lsl#24
+ tst $len,#31
+ beq .Leven
+
+ vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
+ vmov.32 $H4#lo[0],$padbit
+ sub $len,$len,#16
+ add $in2,$inp,#32
+
+# ifdef __ARMEB__
+ vrev32.8 $H0,$H0
+ vrev32.8 $H3,$H3
+ vrev32.8 $H1,$H1
+ vrev32.8 $H2,$H2
+# endif
+ vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
+ vshl.u32 $H3#lo,$H3#lo,#18
+
+ vsri.u32 $H3#lo,$H2#lo,#14
+ vshl.u32 $H2#lo,$H2#lo,#12
+ vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
+
+ vbic.i32 $H3#lo,#0xfc000000
+ vsri.u32 $H2#lo,$H1#lo,#20
+ vshl.u32 $H1#lo,$H1#lo,#6
+
+ vbic.i32 $H2#lo,#0xfc000000
+ vsri.u32 $H1#lo,$H0#lo,#26
+ vadd.i32 $H3#hi,$H3#lo,$D3#lo
+
+ vbic.i32 $H0#lo,#0xfc000000
+ vbic.i32 $H1#lo,#0xfc000000
+ vadd.i32 $H2#hi,$H2#lo,$D2#lo
+
+ vadd.i32 $H0#hi,$H0#lo,$D0#lo
+ vadd.i32 $H1#hi,$H1#lo,$D1#lo
+
+ mov $tbl1,$zeros
+ add $tbl0,$ctx,#48
+
+ cmp $len,$len
+ b .Long_tail
+
+.align 4
+.Leven:
+ subs $len,$len,#64
+ it lo
+ movlo $in2,$zeros
+
+ vmov.i32 $H4,#1<<24 @ padbit, yes, always
+ vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
+ add $inp,$inp,#64
+ vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
+ add $in2,$in2,#64
+ itt hi
+ addhi $tbl1,$ctx,#(48+1*9*4)
+ addhi $tbl0,$ctx,#(48+3*9*4)
+
+# ifdef __ARMEB__
+ vrev32.8 $H0,$H0
+ vrev32.8 $H3,$H3
+ vrev32.8 $H1,$H1
+ vrev32.8 $H2,$H2
+# endif
+ vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
+ vshl.u32 $H3,$H3,#18
+
+ vsri.u32 $H3,$H2,#14
+ vshl.u32 $H2,$H2,#12
+
+ vbic.i32 $H3,#0xfc000000
+ vsri.u32 $H2,$H1,#20
+ vshl.u32 $H1,$H1,#6
+
+ vbic.i32 $H2,#0xfc000000
+ vsri.u32 $H1,$H0,#26
+
+ vbic.i32 $H0,#0xfc000000
+ vbic.i32 $H1,#0xfc000000
+
+ bls .Lskip_loop
+
+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ b .Loop_neon
+
+.align 5
+.Loop_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ @ \___________________/
+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ @ \___________________/ \____________________/
+ @
+ @ Note that we start with inp[2:3]*r^2, because it doesn't
+ @ depend on the reduction in the previous iteration.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
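+ @
+ @ (Editor's note: in scalar terms one iteration is the usual 4-way
+ @ unrolling of Horner's rule,
+ @   acc = (acc + m[0])*r^4 + m[1]*r^3 + m[2]*r^2 + m[3]*r,
+ @ with the even/odd halves kept in the two vector lanes so that
+ @ the two dependency chains overlap.)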
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ inp[2:3]*r^2
+
+ vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
+ vmull.u32 $D2,$H2#hi,${R0}[1]
+ vadd.i32 $H0#lo,$H0#lo,$D0#lo
+ vmull.u32 $D0,$H0#hi,${R0}[1]
+ vadd.i32 $H3#lo,$H3#lo,$D3#lo
+ vmull.u32 $D3,$H3#hi,${R0}[1]
+ vmlal.u32 $D2,$H1#hi,${R1}[1]
+ vadd.i32 $H1#lo,$H1#lo,$D1#lo
+ vmull.u32 $D1,$H1#hi,${R0}[1]
+
+ vadd.i32 $H4#lo,$H4#lo,$D4#lo
+ vmull.u32 $D4,$H4#hi,${R0}[1]
+ subs $len,$len,#64
+ vmlal.u32 $D0,$H4#hi,${S1}[1]
+ it lo
+ movlo $in2,$zeros
+ vmlal.u32 $D3,$H2#hi,${R1}[1]
+ vld1.32 ${S4}[1],[$tbl1,:32]
+ vmlal.u32 $D1,$H0#hi,${R1}[1]
+ vmlal.u32 $D4,$H3#hi,${R1}[1]
+
+ vmlal.u32 $D0,$H3#hi,${S2}[1]
+ vmlal.u32 $D3,$H1#hi,${R2}[1]
+ vmlal.u32 $D4,$H2#hi,${R2}[1]
+ vmlal.u32 $D1,$H4#hi,${S2}[1]
+ vmlal.u32 $D2,$H0#hi,${R2}[1]
+
+ vmlal.u32 $D3,$H0#hi,${R3}[1]
+ vmlal.u32 $D0,$H2#hi,${S3}[1]
+ vmlal.u32 $D4,$H1#hi,${R3}[1]
+ vmlal.u32 $D1,$H3#hi,${S3}[1]
+ vmlal.u32 $D2,$H4#hi,${S3}[1]
+
+ vmlal.u32 $D3,$H4#hi,${S4}[1]
+ vmlal.u32 $D0,$H1#hi,${S4}[1]
+ vmlal.u32 $D4,$H0#hi,${R4}[1]
+ vmlal.u32 $D1,$H2#hi,${S4}[1]
+ vmlal.u32 $D2,$H3#hi,${S4}[1]
+
+ vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
+ add $in2,$in2,#64
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ (hash+inp[0:1])*r^4 and accumulate
+
+ vmlal.u32 $D3,$H3#lo,${R0}[0]
+ vmlal.u32 $D0,$H0#lo,${R0}[0]
+ vmlal.u32 $D4,$H4#lo,${R0}[0]
+ vmlal.u32 $D1,$H1#lo,${R0}[0]
+ vmlal.u32 $D2,$H2#lo,${R0}[0]
+ vld1.32 ${S4}[0],[$tbl0,:32]
+
+ vmlal.u32 $D3,$H2#lo,${R1}[0]
+ vmlal.u32 $D0,$H4#lo,${S1}[0]
+ vmlal.u32 $D4,$H3#lo,${R1}[0]
+ vmlal.u32 $D1,$H0#lo,${R1}[0]
+ vmlal.u32 $D2,$H1#lo,${R1}[0]
+
+ vmlal.u32 $D3,$H1#lo,${R2}[0]
+ vmlal.u32 $D0,$H3#lo,${S2}[0]
+ vmlal.u32 $D4,$H2#lo,${R2}[0]
+ vmlal.u32 $D1,$H4#lo,${S2}[0]
+ vmlal.u32 $D2,$H0#lo,${R2}[0]
+
+ vmlal.u32 $D3,$H0#lo,${R3}[0]
+ vmlal.u32 $D0,$H2#lo,${S3}[0]
+ vmlal.u32 $D4,$H1#lo,${R3}[0]
+ vmlal.u32 $D1,$H3#lo,${S3}[0]
+ vmlal.u32 $D3,$H4#lo,${S4}[0]
+
+ vmlal.u32 $D2,$H4#lo,${S3}[0]
+ vmlal.u32 $D0,$H1#lo,${S4}[0]
+ vmlal.u32 $D4,$H0#lo,${R4}[0]
+ vmov.i32 $H4,#1<<24 @ padbit, yes, always
+ vmlal.u32 $D1,$H2#lo,${S4}[0]
+ vmlal.u32 $D2,$H3#lo,${S4}[0]
+
+ vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
+ add $inp,$inp,#64
+# ifdef __ARMEB__
+ vrev32.8 $H0,$H0
+ vrev32.8 $H1,$H1
+ vrev32.8 $H2,$H2
+ vrev32.8 $H3,$H3
+# endif
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction interleaved with base 2^32 -> base 2^26 of
+ @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
+
+ vshr.u64 $T0,$D3,#26
+ vmovn.i64 $D3#lo,$D3
+ vshr.u64 $T1,$D0,#26
+ vmovn.i64 $D0#lo,$D0
+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
+ vbic.i32 $D3#lo,#0xfc000000
+ vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
+ vshl.u32 $H3,$H3,#18
+ vbic.i32 $D0#lo,#0xfc000000
+
+ vshrn.u64 $T0#lo,$D4,#26
+ vmovn.i64 $D4#lo,$D4
+ vshr.u64 $T1,$D1,#26
+ vmovn.i64 $D1#lo,$D1
+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
+ vsri.u32 $H3,$H2,#14
+ vbic.i32 $D4#lo,#0xfc000000
+ vshl.u32 $H2,$H2,#12
+ vbic.i32 $D1#lo,#0xfc000000
+
+ vadd.i32 $D0#lo,$D0#lo,$T0#lo
+ vshl.u32 $T0#lo,$T0#lo,#2
+ vbic.i32 $H3,#0xfc000000
+ vshrn.u64 $T1#lo,$D2,#26
+ vmovn.i64 $D2#lo,$D2
+ vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
+ vsri.u32 $H2,$H1,#20
+ vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
+ vshl.u32 $H1,$H1,#6
+ vbic.i32 $D2#lo,#0xfc000000
+ vbic.i32 $H2,#0xfc000000
+
+ vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
+ vmovn.i64 $D0#lo,$D0
+ vsri.u32 $H1,$H0,#26
+ vbic.i32 $H0,#0xfc000000
+ vshr.u32 $T1#lo,$D3#lo,#26
+ vbic.i32 $D3#lo,#0xfc000000
+ vbic.i32 $D0#lo,#0xfc000000
+ vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
+ vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
+ vbic.i32 $H1,#0xfc000000
+
+ bhi .Loop_neon
+
+.Lskip_loop:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ add $tbl1,$ctx,#(48+0*9*4)
+ add $tbl0,$ctx,#(48+1*9*4)
+ adds $len,$len,#32
+ it ne
+ movne $len,#0
+ bne .Long_tail
+
+ vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
+ vadd.i32 $H0#hi,$H0#lo,$D0#lo
+ vadd.i32 $H3#hi,$H3#lo,$D3#lo
+ vadd.i32 $H1#hi,$H1#lo,$D1#lo
+ vadd.i32 $H4#hi,$H4#lo,$D4#lo
+
+.Long_tail:
+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
+
+ vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
+ vmull.u32 $D2,$H2#hi,$R0
+ vadd.i32 $H0#lo,$H0#lo,$D0#lo
+ vmull.u32 $D0,$H0#hi,$R0
+ vadd.i32 $H3#lo,$H3#lo,$D3#lo
+ vmull.u32 $D3,$H3#hi,$R0
+ vadd.i32 $H1#lo,$H1#lo,$D1#lo
+ vmull.u32 $D1,$H1#hi,$R0
+ vadd.i32 $H4#lo,$H4#lo,$D4#lo
+ vmull.u32 $D4,$H4#hi,$R0
+
+ vmlal.u32 $D0,$H4#hi,$S1
+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vmlal.u32 $D3,$H2#hi,$R1
+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vmlal.u32 $D1,$H0#hi,$R1
+ vmlal.u32 $D4,$H3#hi,$R1
+ vmlal.u32 $D2,$H1#hi,$R1
+
+ vmlal.u32 $D3,$H1#hi,$R2
+ vld1.32 ${S4}[1],[$tbl1,:32]
+ vmlal.u32 $D0,$H3#hi,$S2
+ vld1.32 ${S4}[0],[$tbl0,:32]
+ vmlal.u32 $D4,$H2#hi,$R2
+ vmlal.u32 $D1,$H4#hi,$S2
+ vmlal.u32 $D2,$H0#hi,$R2
+
+ vmlal.u32 $D3,$H0#hi,$R3
+ it ne
+ addne $tbl1,$ctx,#(48+2*9*4)
+ vmlal.u32 $D0,$H2#hi,$S3
+ it ne
+ addne $tbl0,$ctx,#(48+3*9*4)
+ vmlal.u32 $D4,$H1#hi,$R3
+ vmlal.u32 $D1,$H3#hi,$S3
+ vmlal.u32 $D2,$H4#hi,$S3
+
+ vmlal.u32 $D3,$H4#hi,$S4
+ vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
+ vmlal.u32 $D0,$H1#hi,$S4
+ vshr.u64 $MASK,$MASK,#38
+ vmlal.u32 $D4,$H0#hi,$R4
+ vmlal.u32 $D1,$H2#hi,$S4
+ vmlal.u32 $D2,$H3#hi,$S4
+
+ beq .Lshort_tail
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
+
+ vmlal.u32 $D2,$H2#lo,$R0
+ vmlal.u32 $D0,$H0#lo,$R0
+ vmlal.u32 $D3,$H3#lo,$R0
+ vmlal.u32 $D1,$H1#lo,$R0
+ vmlal.u32 $D4,$H4#lo,$R0
+
+ vmlal.u32 $D0,$H4#lo,$S1
+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vmlal.u32 $D3,$H2#lo,$R1
+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vmlal.u32 $D1,$H0#lo,$R1
+ vmlal.u32 $D4,$H3#lo,$R1
+ vmlal.u32 $D2,$H1#lo,$R1
+
+ vmlal.u32 $D3,$H1#lo,$R2
+ vld1.32 ${S4}[1],[$tbl1,:32]
+ vmlal.u32 $D0,$H3#lo,$S2
+ vld1.32 ${S4}[0],[$tbl0,:32]
+ vmlal.u32 $D4,$H2#lo,$R2
+ vmlal.u32 $D1,$H4#lo,$S2
+ vmlal.u32 $D2,$H0#lo,$R2
+
+ vmlal.u32 $D3,$H0#lo,$R3
+ vmlal.u32 $D0,$H2#lo,$S3
+ vmlal.u32 $D4,$H1#lo,$R3
+ vmlal.u32 $D1,$H3#lo,$S3
+ vmlal.u32 $D2,$H4#lo,$S3
+
+ vmlal.u32 $D3,$H4#lo,$S4
+ vorn $MASK,$MASK,$MASK @ all-ones
+ vmlal.u32 $D0,$H1#lo,$S4
+ vshr.u64 $MASK,$MASK,#38
+ vmlal.u32 $D4,$H0#lo,$R4
+ vmlal.u32 $D1,$H2#lo,$S4
+ vmlal.u32 $D2,$H3#lo,$S4
+
+.Lshort_tail:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ horizontal addition
+
+ vadd.i64 $D3#lo,$D3#lo,$D3#hi
+ vadd.i64 $D0#lo,$D0#lo,$D0#hi
+ vadd.i64 $D4#lo,$D4#lo,$D4#hi
+ vadd.i64 $D1#lo,$D1#lo,$D1#hi
+ vadd.i64 $D2#lo,$D2#lo,$D2#hi
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction, but without narrowing
+
+ vshr.u64 $T0,$D3,#26
+ vand.i64 $D3,$D3,$MASK
+ vshr.u64 $T1,$D0,#26
+ vand.i64 $D0,$D0,$MASK
+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
+
+ vshr.u64 $T0,$D4,#26
+ vand.i64 $D4,$D4,$MASK
+ vshr.u64 $T1,$D1,#26
+ vand.i64 $D1,$D1,$MASK
+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
+
+ vadd.i64 $D0,$D0,$T0
+ vshl.u64 $T0,$T0,#2
+ vshr.u64 $T1,$D2,#26
+ vand.i64 $D2,$D2,$MASK
+ vadd.i64 $D0,$D0,$T0 @ h4 -> h0
+ vadd.i64 $D3,$D3,$T1 @ h2 -> h3
+
+ vshr.u64 $T0,$D0,#26
+ vand.i64 $D0,$D0,$MASK
+ vshr.u64 $T1,$D3,#26
+ vand.i64 $D3,$D3,$MASK
+ vadd.i64 $D1,$D1,$T0 @ h0 -> h1
+ vadd.i64 $D4,$D4,$T1 @ h3 -> h4
+
+ cmp $len,#0
+ bne .Leven
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ store hash value
+
+ vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
+ vst1.32 {$D4#lo[0]},[$ctx]
+
+ vldmia sp!,{d8-d15} @ epilogue
+ ldmia sp!,{r4-r7}
+ ret @ bx lr
+.size poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+#ifndef __KERNEL__
+.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
+.word OPENSSL_armcap_P-.Lpoly1305_init
+# endif
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
+#endif
+#endif
+___
+} }
+$code.=<<___;
+.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
+.align 2
+___
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
+close STDOUT; # enforce flush
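
For reference: the NEON code above keeps the Poly1305 accumulator in five 26-bit limbs, splits each 16-byte block from base 2^32 into that radix (the vshl/vsri/vbic sequence), and only partially propagates carries, folding the carry out of limb 4 back into limb 0 times 5 since 2^130 = 5 (mod 2^130 - 5). A minimal scalar C sketch of both steps, with hypothetical helper names not taken from this patch:

#include <stdint.h>

/* Base 2^32 -> base 2^26 split of one 16-byte block, mirroring the
 * vshl/vsri/vbic sequence above; 'pad' is the 1<<24 padbit. */
static void poly1305_split26(uint32_t h[5], const uint32_t in[4], uint32_t pad)
{
	h[0] = in[0] & 0x3ffffff;
	h[1] = ((in[1] << 6)  | (in[0] >> 26)) & 0x3ffffff;
	h[2] = ((in[2] << 12) | (in[1] >> 20)) & 0x3ffffff;
	h[3] = ((in[3] << 18) | (in[2] >> 14)) & 0x3ffffff;
	h[4] = (in[3] >> 8) | pad;
}

/* The lazy reduction: the same carry chain as the vshr/vbic/vadd pairs
 * above. 2^130 = 5 (mod 2^130 - 5), so the carry out of limb 4
 * re-enters limb 0 multiplied by 5 (c + 4c below). Limbs are left
 * slightly wider than 26 bits, which the next multiply tolerates. */
static void poly1305_lazy_reduce(uint64_t d[5])
{
	uint64_t c;

	c = d[3] >> 26; d[3] &= 0x3ffffff; d[4] += c;            /* h3 -> h4 */
	c = d[0] >> 26; d[0] &= 0x3ffffff; d[1] += c;            /* h0 -> h1 */
	c = d[4] >> 26; d[4] &= 0x3ffffff; d[0] += c + (c << 2); /* h4 -> h0 */
	c = d[1] >> 26; d[1] &= 0x3ffffff; d[2] += c;            /* h1 -> h2 */
	c = d[2] >> 26; d[2] &= 0x3ffffff; d[3] += c;            /* h2 -> h3 */
	c = d[0] >> 26; d[0] &= 0x3ffffff; d[1] += c;            /* h0 -> h1 */
	c = d[3] >> 26; d[3] &= 0x3ffffff; d[4] += c;            /* h3 -> h4 */
}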
diff --git a/lib/crypto/arm/poly1305.h b/lib/crypto/arm/poly1305.h
new file mode 100644
index 000000000000..0fe903d8de55
--- /dev/null
+++ b/lib/crypto/arm/poly1305.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/hwcap.h>
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+
+asmlinkage void poly1305_block_init(struct poly1305_block_state *state,
+ const u8 raw_key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state,
+ const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
+ const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit(const struct poly1305_state *state,
+ u8 digest[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4]);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src,
+ unsigned int len, u32 padbit)
+{
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ static_branch_likely(&have_neon) && likely(may_use_simd())) {
+ do {
+ unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+ scoped_ksimd()
+ poly1305_blocks_neon(state, src, todo, padbit);
+
+ len -= todo;
+ src += todo;
+ } while (len);
+ } else
+ poly1305_blocks_arm(state, src, len, padbit);
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define poly1305_mod_init_arch poly1305_mod_init_arch
+static void poly1305_mod_init_arch(void)
+{
+ if (elf_hwcap & HWCAP_NEON)
+ static_branch_enable(&have_neon);
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
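
A note on the SZ_4K cap in poly1305_blocks() above: kernel-mode NEON sections cannot be preempted on all configurations, so long inputs are walked in 4 KiB chunks with the vector context released between chunks. This bounds the worst-case scheduling latency a single call can add, at negligible throughput cost.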
diff --git a/lib/crypto/arm/sha1-armv4-large.S b/lib/crypto/arm/sha1-armv4-large.S
new file mode 100644
index 000000000000..1c8b685149f2
--- /dev/null
+++ b/lib/crypto/arm/sha1-armv4-large.S
@@ -0,0 +1,507 @@
+#define __ARM_ARCH__ __LINUX_ARM_ARCH__
+@ SPDX-License-Identifier: GPL-2.0
+
+@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
+@ has relicensed it under the GPLv2. Therefore this program is free software;
+@ you can redistribute it and/or modify it under the terms of the GNU General
+@ Public License version 2 as published by the Free Software Foundation.
+@
+@ The original headers, including the original license headers, are
+@ included below for completeness.
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see https://www.openssl.org/~appro/cryptogams/.
+@ ====================================================================
+
+@ sha1_block procedure for ARMv4.
+@
+@ January 2007.
+
+@ Size/performance trade-off
+@ ====================================================================
+@ impl size in bytes comp cycles[*] measured performance
+@ ====================================================================
+@ thumb 304 3212 4420
+@ armv4-small 392/+29% 1958/+64% 2250/+96%
+@ armv4-compact 740/+89% 1552/+26% 1840/+22%
+@ armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
+@ full unroll ~5100/+260% ~1260/+4% ~1300/+5%
+@ ====================================================================
+@ thumb = same as 'small' but in Thumb instructions[**] and
+@ with recurring code in two private functions;
+@ small = detached Xload/update, loops are folded;
+@ compact = detached Xload/update, 5x unroll;
+@ large = interleaved Xload/update, 5x unroll;
+@ full unroll = interleaved Xload/update, full unroll, estimated[!];
+@
+@ [*] Manually counted instructions in "grand" loop body. Measured
+@ performance is affected by prologue and epilogue overhead,
+@ i-cache availability, branch penalties, etc.
+@ [**]	While each Thumb instruction is half the size of an ARM one, the
+@	Thumb set is not as diverse: e.g., there are only two arithmetic
+@	instructions with 3 arguments, no [fixed] rotate, and addressing
+@	modes are limited. As a result it takes more instructions to do
+@	the same job in Thumb, so the code is never half the size and is
+@	always slower.
+@ [***]	which is also ~35% better than compiler-generated code. A dual-
+@	issue Cortex-A8 core was measured to process an input block in
+@	~990 cycles.
+
+@ August 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 13% improvement on
+@ Cortex A8 core and in absolute terms ~870 cycles per input block
+@ [or 13.6 cycles per byte].
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 10%
+@ improvement on Cortex A8 core and 12.2 cycles per byte.
+
+#include <linux/linkage.h>
+
+.text
+
+.align 2
+ENTRY(sha1_block_data_order)
+ stmdb sp!,{r4-r12,lr}
+ add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
+ ldmia r0,{r3,r4,r5,r6,r7}
+.Lloop:
+ ldr r8,.LK_00_19
+ mov r14,sp
+ sub sp,sp,#15*4
+ mov r5,r5,ror#30
+ mov r6,r6,ror#30
+ mov r7,r7,ror#30 @ [6]
+.L_00_15:
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r7,r8,r7,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r5,r6 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r7,r8,r7,ror#2 @ E+=K_00_19
+ eor r10,r5,r6 @ F_xx_xx
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r4,r10,ror#2
+ add r7,r7,r9 @ E+=X[i]
+ eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r7,r7,r10 @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r6,r8,r6,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r4,r5 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r6,r8,r6,ror#2 @ E+=K_00_19
+ eor r10,r4,r5 @ F_xx_xx
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r3,r10,ror#2
+ add r6,r6,r9 @ E+=X[i]
+ eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r6,r6,r10 @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r5,r8,r5,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r3,r4 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r5,r8,r5,ror#2 @ E+=K_00_19
+ eor r10,r3,r4 @ F_xx_xx
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r7,r10,ror#2
+ add r5,r5,r9 @ E+=X[i]
+ eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r5,r5,r10 @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r4,r8,r4,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r7,r3 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r4,r8,r4,ror#2 @ E+=K_00_19
+ eor r10,r7,r3 @ F_xx_xx
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r6,r10,ror#2
+ add r4,r4,r9 @ E+=X[i]
+ eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r4,r4,r10 @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r3,r8,r3,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r6,r7 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r3,r8,r3,ror#2 @ E+=K_00_19
+ eor r10,r6,r7 @ F_xx_xx
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r5,r10,ror#2
+ add r3,r3,r9 @ E+=X[i]
+ eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r3,r3,r10 @ E+=F_00_19(B,C,D)
+ cmp r14,sp
+ bne .L_00_15 @ [((11+4)*5+2)*3]
+ sub sp,sp,#25*4
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r7,r8,r7,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r5,r6 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r7,r8,r7,ror#2 @ E+=K_00_19
+ eor r10,r5,r6 @ F_xx_xx
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r4,r10,ror#2
+ add r7,r7,r9 @ E+=X[i]
+ eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r7,r7,r10 @ E+=F_00_19(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r6,r8,r6,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r4,r5 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r3,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r6,r6,r9 @ E+=X[i]
+ eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
+ add r6,r6,r10 @ E+=F_00_19(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r5,r8,r5,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r3,r4 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r7,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r5,r5,r9 @ E+=X[i]
+ eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
+ add r5,r5,r10 @ E+=F_00_19(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r4,r8,r4,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r7,r3 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r6,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r4,r4,r9 @ E+=X[i]
+ eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
+ add r4,r4,r10 @ E+=F_00_19(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r3,r8,r3,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r6,r7 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r5,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r3,r3,r9 @ E+=X[i]
+ eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
+ add r3,r3,r10 @ E+=F_00_19(B,C,D)
+
+ ldr r8,.LK_20_39 @ [+15+16*4]
+ cmn sp,#0 @ [+3], clear carry to denote 20_39
+.L_20_39_or_60_79:
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r7,r8,r7,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r5,r6 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r4,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r7,r7,r9 @ E+=X[i]
+ add r7,r7,r10 @ E+=F_20_39(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r6,r8,r6,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r4,r5 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r3,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r6,r6,r9 @ E+=X[i]
+ add r6,r6,r10 @ E+=F_20_39(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r5,r8,r5,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r3,r4 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r7,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r5,r5,r9 @ E+=X[i]
+ add r5,r5,r10 @ E+=F_20_39(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r4,r8,r4,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r7,r3 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r6,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r4,r4,r9 @ E+=X[i]
+ add r4,r4,r10 @ E+=F_20_39(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r3,r8,r3,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r6,r7 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r5,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r3,r3,r9 @ E+=X[i]
+ add r3,r3,r10 @ E+=F_20_39(B,C,D)
+ ARM( teq r14,sp ) @ preserve carry
+ THUMB( mov r11,sp )
+ THUMB( teq r14,r11 ) @ preserve carry
+ bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
+ bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
+
+ ldr r8,.LK_40_59
+ sub sp,sp,#20*4 @ [+2]
+.L_40_59:
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r7,r8,r7,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r5,r6 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r4,r10,ror#2 @ F_xx_xx
+ and r11,r5,r6 @ F_xx_xx
+ add r7,r7,r9 @ E+=X[i]
+ add r7,r7,r10 @ E+=F_40_59(B,C,D)
+ add r7,r7,r11,ror#2
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r6,r8,r6,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r4,r5 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r3,r10,ror#2 @ F_xx_xx
+ and r11,r4,r5 @ F_xx_xx
+ add r6,r6,r9 @ E+=X[i]
+ add r6,r6,r10 @ E+=F_40_59(B,C,D)
+ add r6,r6,r11,ror#2
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r5,r8,r5,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r3,r4 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r7,r10,ror#2 @ F_xx_xx
+ and r11,r3,r4 @ F_xx_xx
+ add r5,r5,r9 @ E+=X[i]
+ add r5,r5,r10 @ E+=F_40_59(B,C,D)
+ add r5,r5,r11,ror#2
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r4,r8,r4,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r7,r3 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r6,r10,ror#2 @ F_xx_xx
+ and r11,r7,r3 @ F_xx_xx
+ add r4,r4,r9 @ E+=X[i]
+ add r4,r4,r10 @ E+=F_40_59(B,C,D)
+ add r4,r4,r11,ror#2
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r3,r8,r3,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r6,r7 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r5,r10,ror#2 @ F_xx_xx
+ and r11,r6,r7 @ F_xx_xx
+ add r3,r3,r9 @ E+=X[i]
+ add r3,r3,r10 @ E+=F_40_59(B,C,D)
+ add r3,r3,r11,ror#2
+ cmp r14,sp
+ bne .L_40_59 @ [+((12+5)*5+2)*4]
+
+ ldr r8,.LK_60_79
+ sub sp,sp,#20*4
+ cmp sp,#0 @ set carry to denote 60_79
+ b .L_20_39_or_60_79 @ [+4], spare 300 bytes
+.L_done:
+ add sp,sp,#80*4 @ "deallocate" stack frame
+ ldmia r0,{r8,r9,r10,r11,r12}
+ add r3,r8,r3
+ add r4,r9,r4
+ add r5,r10,r5,ror#2
+ add r6,r11,r6,ror#2
+ add r7,r12,r7,ror#2
+ stmia r0,{r3,r4,r5,r6,r7}
+ teq r1,r2
+ bne .Lloop @ [+18], total 1307
+
+ ldmia sp!,{r4-r12,pc}
+.align 2
+.LK_00_19: .word 0x5a827999
+.LK_20_39: .word 0x6ed9eba1
+.LK_40_59: .word 0x8f1bbcdc
+.LK_60_79: .word 0xca62c1d6
+ENDPROC(sha1_block_data_order)
+.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
+.align 2
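
When reading the unrolled body above it may help to note that E+=ROR(A,27) is E+=ROL(A,5), that the working variables are kept pre-rotated with the recurring ror#2 compensating on use, and that the schedule step builds X[i] = ROL(X[i-3]^X[i-8]^X[i-14]^X[i-16], 1) with ror#31 standing in for the left-rotate by one. A plain C sketch of one round for i in [0,20), with illustrative names not taken from this patch:

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

/* One SHA-1 round for i in [0,20): F_00_19 is the bit-select function,
 * computed above as ((C^D)&B)^D with the rotations folded in. The
 * unrolled assembly "rotates" the five variables by renaming registers
 * instead of moving data. */
static void sha1_round_00_19(uint32_t v[5], uint32_t xi)
{
	uint32_t f = v[3] ^ (v[1] & (v[2] ^ v[3]));	/* F_00_19(B,C,D) */
	uint32_t e = v[4] + rol32(v[0], 5) + f + 0x5a827999 + xi;

	v[4] = v[3]; v[3] = v[2];
	v[2] = rol32(v[1], 30);
	v[1] = v[0]; v[0] = e;
}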
diff --git a/lib/crypto/arm/sha1-armv7-neon.S b/lib/crypto/arm/sha1-armv7-neon.S
new file mode 100644
index 000000000000..a0323fa5c58a
--- /dev/null
+++ b/lib/crypto/arm/sha1-armv7-neon.S
@@ -0,0 +1,633 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* ARM/NEON accelerated SHA-1 transform function
+ *
+ * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+.syntax unified
+.fpu neon
+
+.text
+
+
+/* Context structure */
+
+#define state_h0 0
+#define state_h1 4
+#define state_h2 8
+#define state_h3 12
+#define state_h4 16
+
+
+/* Constants */
+
+#define K1 0x5A827999
+#define K2 0x6ED9EBA1
+#define K3 0x8F1BBCDC
+#define K4 0xCA62C1D6
+.align 4
+.LK_VEC:
+.LK1: .long K1, K1, K1, K1
+.LK2: .long K2, K2, K2, K2
+.LK3: .long K3, K3, K3, K3
+.LK4: .long K4, K4, K4, K4
+
+
+/* Register macros */
+
+#define RSTATE r0
+#define RDATA r1
+#define RNBLKS r2
+#define ROLDSTACK r3
+#define RWK lr
+
+#define _a r4
+#define _b r5
+#define _c r6
+#define _d r7
+#define _e r8
+
+#define RT0 r9
+#define RT1 r10
+#define RT2 r11
+#define RT3 r12
+
+#define W0 q0
+#define W1 q7
+#define W2 q2
+#define W3 q3
+#define W4 q4
+#define W5 q6
+#define W6 q5
+#define W7 q1
+
+#define tmp0 q8
+#define tmp1 q9
+#define tmp2 q10
+#define tmp3 q11
+
+#define qK1 q12
+#define qK2 q13
+#define qK3 q14
+#define qK4 q15
+
+#ifdef CONFIG_CPU_BIG_ENDIAN
+#define ARM_LE(code...)
+#else
+#define ARM_LE(code...) code
+#endif
+
+/* Round function macros. */
+
+#define WK_offs(i) (((i) & 15) * 4)
+
+#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ldr RT3, [sp, WK_offs(i)]; \
+ pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ bic RT0, d, b; \
+ add e, e, a, ror #(32 - 5); \
+ and RT1, c, b; \
+ pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add RT0, RT0, RT3; \
+ add e, e, RT1; \
+ ror b, #(32 - 30); \
+ pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add e, e, RT0;
+
+#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ldr RT3, [sp, WK_offs(i)]; \
+ pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ eor RT0, d, b; \
+ add e, e, a, ror #(32 - 5); \
+ eor RT0, RT0, c; \
+ pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add e, e, RT3; \
+ ror b, #(32 - 30); \
+ pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add e, e, RT0; \
+
+#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ldr RT3, [sp, WK_offs(i)]; \
+ pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ eor RT0, b, c; \
+ and RT1, b, c; \
+ add e, e, a, ror #(32 - 5); \
+ pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ and RT0, RT0, d; \
+ add RT1, RT1, RT3; \
+ add e, e, RT0; \
+ ror b, #(32 - 30); \
+ pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add e, e, RT1;
+
+#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define R(a,b,c,d,e,f,i) \
+ _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define dummy(...)
+
+
+/* Input expansion macros. */
+
+/********* Precalc macros for rounds 0-15 *************************************/
+
+#define W_PRECALC_00_15() \
+ add RWK, sp, #(WK_offs(0)); \
+ \
+ vld1.32 {W0, W7}, [RDATA]!; \
+ ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \
+ vld1.32 {W6, W5}, [RDATA]!; \
+ vadd.u32 tmp0, W0, curK; \
+ ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \
+ ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \
+ vadd.u32 tmp1, W7, curK; \
+ ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \
+ vadd.u32 tmp2, W6, curK; \
+ vst1.32 {tmp0, tmp1}, [RWK]!; \
+ vadd.u32 tmp3, W5, curK; \
+ vst1.32 {tmp2, tmp3}, [RWK]; \
+
+#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vld1.32 {W0, W7}, [RDATA]!; \
+
+#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ add RWK, sp, #(WK_offs(0)); \
+
+#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \
+
+#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vld1.32 {W6, W5}, [RDATA]!; \
+
+#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp0, W0, curK; \
+
+#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \
+
+#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \
+
+#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp1, W7, curK; \
+
+#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \
+
+#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp2, W6, curK; \
+
+#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vst1.32 {tmp0, tmp1}, [RWK]!; \
+
+#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp3, W5, curK; \
+
+#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vst1.32 {tmp2, tmp3}, [RWK]; \
+
+
+/********* Precalc macros for rounds 16-31 ************************************/
+
+#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor tmp0, tmp0; \
+ vext.8 W, W_m16, W_m12, #8; \
+
+#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ add RWK, sp, #(WK_offs(i)); \
+ vext.8 tmp0, W_m04, tmp0, #4; \
+
+#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor tmp0, tmp0, W_m16; \
+ veor.32 W, W, W_m08; \
+
+#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor tmp1, tmp1; \
+ veor W, W, tmp0; \
+
+#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vshl.u32 tmp0, W, #1; \
+
+#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vext.8 tmp1, tmp1, W, #(16-12); \
+ vshr.u32 W, W, #31; \
+
+#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vorr tmp0, tmp0, W; \
+ vshr.u32 W, tmp1, #30; \
+
+#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vshl.u32 tmp1, tmp1, #2; \
+
+#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor tmp0, tmp0, W; \
+
+#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor W, tmp0, tmp1; \
+
+#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp0, W, curK; \
+
+#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vst1.32 {tmp0}, [RWK];
+
+
+/********* Precalc macros for rounds 32-79 ************************************/
+
+#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor W, W_m28; \
+
+#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vext.8 tmp0, W_m08, W_m04, #8; \
+
+#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor W, W_m16; \
+
+#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor W, tmp0; \
+
+#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ add RWK, sp, #(WK_offs(i&~3)); \
+
+#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vshl.u32 tmp1, W, #2; \
+
+#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vshr.u32 tmp0, W, #30; \
+
+#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vorr W, tmp0, tmp1; \
+
+#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp0, W, curK; \
+
+#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vst1.32 {tmp0}, [RWK];
+
+
+/*
+ * Transform nblocks*64 bytes (nblocks*16 32-bit words) at DATA.
+ *
+ * void sha1_transform_neon(struct sha1_block_state *state,
+ * const u8 *data, size_t nblocks);
+ */
+.align 3
+ENTRY(sha1_transform_neon)
+ /* input:
+ * r0: state
+ * r1: data (64*nblocks bytes)
+ * r2: nblocks
+ */
+
+ cmp RNBLKS, #0;
+ beq .Ldo_nothing;
+
+ push {r4-r12, lr};
+ /*vpush {q4-q7};*/
+
+ adr RT3, .LK_VEC;
+
+ mov ROLDSTACK, sp;
+
+ /* Align stack. */
+ sub RT0, sp, #(16*4);
+ and RT0, #(~(16-1));
+ mov sp, RT0;
+
+ vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */
+
+ /* Get the values of the chaining variables. */
+ ldm RSTATE, {_a-_e};
+
+ vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */
+
+#undef curK
+#define curK qK1
+ /* Precalc 0-15. */
+ W_PRECALC_00_15();
+
+.Loop:
+ /* Transform 0-15 + Precalc 16-31. */
+ _R( _a, _b, _c, _d, _e, F1, 0,
+ WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
+ W4, W5, W6, W7, W0, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F1, 1,
+ WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
+ W4, W5, W6, W7, W0, _, _, _ );
+ _R( _d, _e, _a, _b, _c, F1, 2,
+ WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
+ W4, W5, W6, W7, W0, _, _, _ );
+ _R( _c, _d, _e, _a, _b, F1, 3,
+ WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
+ W4, W5, W6, W7, W0, _, _, _ );
+
+#undef curK
+#define curK qK2
+ _R( _b, _c, _d, _e, _a, F1, 4,
+ WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
+ W3, W4, W5, W6, W7, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F1, 5,
+ WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
+ W3, W4, W5, W6, W7, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F1, 6,
+ WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
+ W3, W4, W5, W6, W7, _, _, _ );
+ _R( _d, _e, _a, _b, _c, F1, 7,
+ WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
+ W3, W4, W5, W6, W7, _, _, _ );
+
+ _R( _c, _d, _e, _a, _b, F1, 8,
+ WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
+ W2, W3, W4, W5, W6, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F1, 9,
+ WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
+ W2, W3, W4, W5, W6, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F1, 10,
+ WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
+ W2, W3, W4, W5, W6, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F1, 11,
+ WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
+ W2, W3, W4, W5, W6, _, _, _ );
+
+ _R( _d, _e, _a, _b, _c, F1, 12,
+ WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
+ W1, W2, W3, W4, W5, _, _, _ );
+ _R( _c, _d, _e, _a, _b, F1, 13,
+ WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
+ W1, W2, W3, W4, W5, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F1, 14,
+ WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
+ W1, W2, W3, W4, W5, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F1, 15,
+ WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
+ W1, W2, W3, W4, W5, _, _, _ );
+
+ /* Transform 16-63 + Precalc 32-79. */
+ _R( _e, _a, _b, _c, _d, F1, 16,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _d, _e, _a, _b, _c, F1, 17,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _c, _d, _e, _a, _b, F1, 18,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 32,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _b, _c, _d, _e, _a, F1, 19,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+
+ _R( _a, _b, _c, _d, _e, F2, 20,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _e, _a, _b, _c, _d, F2, 21,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _d, _e, _a, _b, _c, F2, 22,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 36,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _c, _d, _e, _a, _b, F2, 23,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+
+#undef curK
+#define curK qK3
+ _R( _b, _c, _d, _e, _a, F2, 24,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _a, _b, _c, _d, _e, F2, 25,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _e, _a, _b, _c, _d, F2, 26,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 40,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _d, _e, _a, _b, _c, F2, 27,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+
+ _R( _c, _d, _e, _a, _b, F2, 28,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _b, _c, _d, _e, _a, F2, 29,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _a, _b, _c, _d, _e, F2, 30,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 44,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _e, _a, _b, _c, _d, F2, 31,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+
+ _R( _d, _e, _a, _b, _c, F2, 32,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
+ W4, W5, W6, W7, W0, W1, W2, W3);
+ _R( _c, _d, _e, _a, _b, F2, 33,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
+ W4, W5, W6, W7, W0, W1, W2, W3);
+ _R( _b, _c, _d, _e, _a, F2, 34,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 48,
+ W4, W5, W6, W7, W0, W1, W2, W3);
+ _R( _a, _b, _c, _d, _e, F2, 35,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48,
+ W4, W5, W6, W7, W0, W1, W2, W3);
+
+ _R( _e, _a, _b, _c, _d, F2, 36,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
+ W3, W4, W5, W6, W7, W0, W1, W2);
+ _R( _d, _e, _a, _b, _c, F2, 37,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
+ W3, W4, W5, W6, W7, W0, W1, W2);
+ _R( _c, _d, _e, _a, _b, F2, 38,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 52,
+ W3, W4, W5, W6, W7, W0, W1, W2);
+ _R( _b, _c, _d, _e, _a, F2, 39,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52,
+ W3, W4, W5, W6, W7, W0, W1, W2);
+
+ _R( _a, _b, _c, _d, _e, F3, 40,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
+ W2, W3, W4, W5, W6, W7, W0, W1);
+ _R( _e, _a, _b, _c, _d, F3, 41,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
+ W2, W3, W4, W5, W6, W7, W0, W1);
+ _R( _d, _e, _a, _b, _c, F3, 42,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 56,
+ W2, W3, W4, W5, W6, W7, W0, W1);
+ _R( _c, _d, _e, _a, _b, F3, 43,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56,
+ W2, W3, W4, W5, W6, W7, W0, W1);
+
+#undef curK
+#define curK qK4
+ _R( _b, _c, _d, _e, _a, F3, 44,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
+ W1, W2, W3, W4, W5, W6, W7, W0);
+ _R( _a, _b, _c, _d, _e, F3, 45,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
+ W1, W2, W3, W4, W5, W6, W7, W0);
+ _R( _e, _a, _b, _c, _d, F3, 46,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 60,
+ W1, W2, W3, W4, W5, W6, W7, W0);
+ _R( _d, _e, _a, _b, _c, F3, 47,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60,
+ W1, W2, W3, W4, W5, W6, W7, W0);
+
+ _R( _c, _d, _e, _a, _b, F3, 48,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _b, _c, _d, _e, _a, F3, 49,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _a, _b, _c, _d, _e, F3, 50,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 64,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _e, _a, _b, _c, _d, F3, 51,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+
+ _R( _d, _e, _a, _b, _c, F3, 52,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _c, _d, _e, _a, _b, F3, 53,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _b, _c, _d, _e, _a, F3, 54,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 68,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _a, _b, _c, _d, _e, F3, 55,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+
+ _R( _e, _a, _b, _c, _d, F3, 56,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _d, _e, _a, _b, _c, F3, 57,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _c, _d, _e, _a, _b, F3, 58,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 72,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _b, _c, _d, _e, _a, F3, 59,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+
+ subs RNBLKS, #1;
+
+ _R( _a, _b, _c, _d, _e, F4, 60,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _e, _a, _b, _c, _d, F4, 61,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _d, _e, _a, _b, _c, F4, 62,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 76,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _c, _d, _e, _a, _b, F4, 63,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+
+ beq .Lend;
+
+ /* Transform 64-79 + Precalc 0-15 of next block. */
+#undef curK
+#define curK qK1
+ _R( _b, _c, _d, _e, _a, F4, 64,
+ WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F4, 65,
+ WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F4, 66,
+ WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _d, _e, _a, _b, _c, F4, 67,
+ WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+ _R( _c, _d, _e, _a, _b, F4, 68,
+ dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F4, 69,
+ dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F4, 70,
+ WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F4, 71,
+ WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+ _R( _d, _e, _a, _b, _c, F4, 72,
+ dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _c, _d, _e, _a, _b, F4, 73,
+ dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F4, 74,
+ WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F4, 75,
+ WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+ _R( _e, _a, _b, _c, _d, F4, 76,
+ WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _d, _e, _a, _b, _c, F4, 77,
+ WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _c, _d, _e, _a, _b, F4, 78,
+ WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F4, 79,
+ WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );
+
+ /* Update the chaining variables. */
+ ldm RSTATE, {RT0-RT3};
+ add _a, RT0;
+ ldr RT0, [RSTATE, #state_h4];
+ add _b, RT1;
+ add _c, RT2;
+ add _d, RT3;
+ add _e, RT0;
+ stm RSTATE, {_a-_e};
+
+ b .Loop;
+
+.Lend:
+ /* Transform 64-79 */
+ R( _b, _c, _d, _e, _a, F4, 64 );
+ R( _a, _b, _c, _d, _e, F4, 65 );
+ R( _e, _a, _b, _c, _d, F4, 66 );
+ R( _d, _e, _a, _b, _c, F4, 67 );
+ R( _c, _d, _e, _a, _b, F4, 68 );
+ R( _b, _c, _d, _e, _a, F4, 69 );
+ R( _a, _b, _c, _d, _e, F4, 70 );
+ R( _e, _a, _b, _c, _d, F4, 71 );
+ R( _d, _e, _a, _b, _c, F4, 72 );
+ R( _c, _d, _e, _a, _b, F4, 73 );
+ R( _b, _c, _d, _e, _a, F4, 74 );
+ R( _a, _b, _c, _d, _e, F4, 75 );
+ R( _e, _a, _b, _c, _d, F4, 76 );
+ R( _d, _e, _a, _b, _c, F4, 77 );
+ R( _c, _d, _e, _a, _b, F4, 78 );
+ R( _b, _c, _d, _e, _a, F4, 79 );
+
+ mov sp, ROLDSTACK;
+
+ /* Update the chaining variables. */
+ ldm RSTATE, {RT0-RT3};
+ add _a, RT0;
+ ldr RT0, [RSTATE, #state_h4];
+ add _b, RT1;
+ add _c, RT2;
+ add _d, RT3;
+ /*vpop {q4-q7};*/
+ add _e, RT0;
+ stm RSTATE, {_a-_e};
+
+ pop {r4-r12, pc};
+
+.Ldo_nothing:
+ bx lr
+ENDPROC(sha1_transform_neon)
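
The WPRECALC macros above vectorize the SHA-1 message schedule four words at a time. The standard recurrence depends on a word only three positions back, so a 4-wide step would need part of its own output; rounds 16-31 zero the unavailable lane and patch the last word with a rol-2 fixup, while rounds 32-79 use an equivalent form of the recurrence whose nearest dependence is six words back. A scalar sketch of the two forms (illustrative, not from the patch):

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

/* Fill an 80-word SHA-1 schedule. Both loops yield identical values;
 * the second form, obtained by applying the recurrence to itself, is
 * what lets WPRECALC_32_79 compute four independent words per step. */
static void sha1_schedule(uint32_t w[80])
{
	int i;

	for (i = 16; i < 32; i++)
		w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
	for (i = 32; i < 80; i++)
		w[i] = rol32(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2);
}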
diff --git a/lib/crypto/arm/sha1-ce-core.S b/lib/crypto/arm/sha1-ce-core.S
new file mode 100644
index 000000000000..7d6b2631ca8d
--- /dev/null
+++ b/lib/crypto/arm/sha1-ce-core.S
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SHA-1 secure hash using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2015 Linaro Ltd.
+ * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+ .arch armv8-a
+ .fpu crypto-neon-fp-armv8
+
+ k0 .req q0
+ k1 .req q1
+ k2 .req q2
+ k3 .req q3
+
+ ta0 .req q4
+ ta1 .req q5
+ tb0 .req q5
+ tb1 .req q4
+
+ dga .req q6
+ dgb .req q7
+ dgbs .req s28
+
+ dg0 .req q12
+ dg1a0 .req q13
+ dg1a1 .req q14
+ dg1b0 .req q14
+ dg1b1 .req q13
+
+ .macro add_only, op, ev, rc, s0, dg1
+ .ifnb \s0
+ vadd.u32 tb\ev, q\s0, \rc
+ .endif
+ sha1h.32 dg1b\ev, dg0
+ .ifb \dg1
+ sha1\op\().32 dg0, dg1a\ev, ta\ev
+ .else
+ sha1\op\().32 dg0, \dg1, ta\ev
+ .endif
+ .endm
+
+ .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1
+ sha1su0.32 q\s0, q\s1, q\s2
+ add_only \op, \ev, \rc, \s1, \dg1
+ sha1su1.32 q\s0, q\s3
+ .endm
+
+ .align 6
+.Lsha1_rcon:
+ .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999
+ .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1
+ .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc
+ .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
+
+ /*
+ * void sha1_ce_transform(struct sha1_block_state *state,
+ * const u8 *data, size_t nblocks);
+ */
+ENTRY(sha1_ce_transform)
+ /* load round constants */
+ adr ip, .Lsha1_rcon
+ vld1.32 {k0-k1}, [ip, :128]!
+ vld1.32 {k2-k3}, [ip, :128]
+
+ /* load state */
+ vld1.32 {dga}, [r0]
+ vldr dgbs, [r0, #16]
+
+ /* load input */
+0: vld1.32 {q8-q9}, [r1]!
+ vld1.32 {q10-q11}, [r1]!
+ subs r2, r2, #1
+
+#ifndef CONFIG_CPU_BIG_ENDIAN
+ vrev32.8 q8, q8
+ vrev32.8 q9, q9
+ vrev32.8 q10, q10
+ vrev32.8 q11, q11
+#endif
+
+ vadd.u32 ta0, q8, k0
+ vmov dg0, dga
+
+ add_update c, 0, k0, 8, 9, 10, 11, dgb
+ add_update c, 1, k0, 9, 10, 11, 8
+ add_update c, 0, k0, 10, 11, 8, 9
+ add_update c, 1, k0, 11, 8, 9, 10
+ add_update c, 0, k1, 8, 9, 10, 11
+
+ add_update p, 1, k1, 9, 10, 11, 8
+ add_update p, 0, k1, 10, 11, 8, 9
+ add_update p, 1, k1, 11, 8, 9, 10
+ add_update p, 0, k1, 8, 9, 10, 11
+ add_update p, 1, k2, 9, 10, 11, 8
+
+ add_update m, 0, k2, 10, 11, 8, 9
+ add_update m, 1, k2, 11, 8, 9, 10
+ add_update m, 0, k2, 8, 9, 10, 11
+ add_update m, 1, k2, 9, 10, 11, 8
+ add_update m, 0, k3, 10, 11, 8, 9
+
+ add_update p, 1, k3, 11, 8, 9, 10
+ add_only p, 0, k3, 9
+ add_only p, 1, k3, 10
+ add_only p, 0, k3, 11
+ add_only p, 1
+
+ /* update state */
+ vadd.u32 dga, dga, dg0
+ vadd.u32 dgb, dgb, dg1a0
+ bne 0b
+
+ /* store new state */
+ vst1.32 {dga}, [r0]
+ vstr dgbs, [r0, #16]
+ bx lr
+ENDPROC(sha1_ce_transform)
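
In sha1_ce_transform() above, sha1c/sha1p/sha1m each execute four rounds with the choose/parity/majority function, sha1h produces the rotated fifth working variable, and sha1su0/sha1su1 advance the message schedule. The same instructions are reachable from C through the ACLE intrinsics; a fragment showing the shape of one four-round step, assuming a toolchain targeting the crypto extensions (illustrative only, not part of the patch):

#include <arm_neon.h>

/* Four 'choose' rounds: abcd holds {a,b,c,d}, *e the fifth word, and
 * wk four schedule words with the round constant already added in
 * (as the ta/tb registers above). */
static uint32x4_t sha1_four_c_rounds(uint32x4_t abcd, uint32_t *e,
				     uint32x4_t wk)
{
	uint32_t e_next = vsha1h_u32(vgetq_lane_u32(abcd, 0)); /* rol30(a) */

	abcd = vsha1cq_u32(abcd, *e, wk);  /* sha1c: 4 rounds of F_00_19 */
	*e = e_next;
	return abcd;
}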
diff --git a/lib/crypto/arm/sha1.h b/lib/crypto/arm/sha1.h
new file mode 100644
index 000000000000..3e2d8c7cab9f
--- /dev/null
+++ b/lib/crypto/arm/sha1.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-1 optimized for ARM
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+asmlinkage void sha1_block_data_order(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks);
+asmlinkage void sha1_transform_neon(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks);
+asmlinkage void sha1_ce_transform(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha1_blocks(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ static_branch_likely(&have_neon) && likely(may_use_simd())) {
+ scoped_ksimd() {
+ if (static_branch_likely(&have_ce))
+ sha1_ce_transform(state, data, nblocks);
+ else
+ sha1_transform_neon(state, data, nblocks);
+ }
+ } else {
+ sha1_block_data_order(state, data, nblocks);
+ }
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define sha1_mod_init_arch sha1_mod_init_arch
+static void sha1_mod_init_arch(void)
+{
+ if (elf_hwcap & HWCAP_NEON) {
+ static_branch_enable(&have_neon);
+ if (elf_hwcap2 & HWCAP2_SHA1)
+ static_branch_enable(&have_ce);
+ }
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
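
The dispatch pattern in this header is worth noting: CPU features are probed once at init and cached in static keys, so the hot path costs only a patched branch rather than a hwcap load, and may_use_simd() diverts callers in contexts where the NEON register file cannot be used (for example hard interrupt context) to the scalar sha1_block_data_order() fallback.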
diff --git a/lib/crypto/arm/sha256-armv4.pl b/lib/crypto/arm/sha256-armv4.pl
new file mode 100644
index 000000000000..f3a2b54efd4e
--- /dev/null
+++ b/lib/crypto/arm/sha256-armv4.pl
@@ -0,0 +1,724 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from the OpenSSL project but the author (Andy Polyakov)
+# has relicensed it under the GPLv2. Therefore this program is free software;
+# you can redistribute it and/or modify it under the terms of the GNU General
+# Public License version 2 as published by the Free Software Foundation.
+#
+# The original headers, including the original license headers, are
+# included below for completeness.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA256 block procedure for ARMv4. May 2007.
+
+# Performance is ~2x better than gcc 3.4-generated code and, in absolute
+# terms, ~2250 cycles per 64-byte block or ~35 cycles per byte [on a
+# single-issue Xscale PXA250 core].
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+# September 2013.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process one
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+# code (meaning the latter performs sub-optimally there; nothing was
+# done about it).
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0"; $t0="r0";
+$inp="r1"; $t4="r1";
+$len="r2"; $t1="r2";
+$T1="r3"; $t3="r3";
+$A="r4";
+$B="r5";
+$C="r6";
+$D="r7";
+$E="r8";
+$F="r9";
+$G="r10";
+$H="r11";
+@V=($A,$B,$C,$D,$E,$F,$G,$H);
+$t2="r12";
+$Ktbl="r14";
+
+@Sigma0=( 2,13,22);
+@Sigma1=( 6,11,25);
+@sigma0=( 7,18, 3);
+@sigma1=(17,19,10);
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+ @ ldr $t1,[$inp],#4 @ $i
+# if $i==15
+ str $inp,[sp,#17*4] @ make room for $t4
+# endif
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+# ifndef __ARMEB__
+ rev $t1,$t1
+# endif
+#else
+ @ ldrb $t1,[$inp,#3] @ $i
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ ldrb $t2,[$inp,#2]
+ ldrb $t0,[$inp,#1]
+ orr $t1,$t1,$t2,lsl#8
+ ldrb $t2,[$inp],#4
+ orr $t1,$t1,$t0,lsl#16
+# if $i==15
+ str $inp,[sp,#17*4] @ make room for $t4
+# endif
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+ orr $t1,$t1,$t2,lsl#24
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+#endif
+___
+$code.=<<___;
+ ldr $t2,[$Ktbl],#4 @ *K256++
+ add $h,$h,$t1 @ h+=X[i]
+ str $t1,[sp,#`$i%16`*4]
+ eor $t1,$f,$g
+ add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
+ and $t1,$t1,$e
+ add $h,$h,$t2 @ h+=K256[i]
+ eor $t1,$t1,$g @ Ch(e,f,g)
+ eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
+ add $h,$h,$t1 @ h+=Ch(e,f,g)
+#if $i==31
+ and $t2,$t2,#0xff
+ cmp $t2,#0xf2 @ done?
+#endif
+#if $i<15
+# if __ARM_ARCH__>=7
+ ldr $t1,[$inp],#4 @ prefetch
+# else
+ ldrb $t1,[$inp,#3]
+# endif
+ eor $t2,$a,$b @ a^b, b^c in next round
+#else
+ ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
+ eor $t2,$a,$b @ a^b, b^c in next round
+ ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
+#endif
+ eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
+ and $t3,$t3,$t2 @ (b^c)&=(a^b)
+ add $d,$d,$h @ d+=h
+ eor $t3,$t3,$b @ Maj(a,b,c)
+ add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
+ @ add $h,$h,$t3 @ h+=Maj(a,b,c)
+___
+ ($t2,$t3)=($t3,$t2);
+}
+
+sub BODY_16_XX {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___;
+ @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
+ @ ldr $t4,[sp,#`($i+14)%16`*4]
+ mov $t0,$t1,ror#$sigma0[0]
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ mov $t2,$t4,ror#$sigma1[0]
+ eor $t0,$t0,$t1,ror#$sigma0[1]
+ eor $t2,$t2,$t4,ror#$sigma1[1]
+ eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
+ ldr $t1,[sp,#`($i+0)%16`*4]
+ eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
+ ldr $t4,[sp,#`($i+9)%16`*4]
+
+ add $t2,$t2,$t0
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
+ add $t1,$t1,$t2
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+ add $t1,$t1,$t4 @ X[i]
+___
+ &BODY_00_15(@_);
+}
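
BODY_00_15 above emits one SHA-256 round, and BODY_16_XX prepends the schedule update. Each Sigma is built as two eor/ror steps relative to the smallest rotate count (the `$Sigma1[1]-$Sigma1[0]` style deltas), with the smallest rotation applied only once when the result is added. A plain C version of the round, following the standard FIPS 180-4 definitions, with illustrative names:

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n)
{
	return (x >> n) | (x << (32 - n));
}

/* One SHA-256 round; v[] = {a,b,c,d,e,f,g,h} and the caller rotates
 * the variables between rounds (the Perl code rotates @V instead).
 * Rotate counts match @Sigma0=(2,13,22) and @Sigma1=(6,11,25). */
static void sha256_round(uint32_t v[8], uint32_t ki, uint32_t wi)
{
	uint32_t a = v[0], b = v[1], c = v[2], e = v[4];
	uint32_t S1 = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
	uint32_t ch = v[6] ^ (e & (v[5] ^ v[6]));  /* Ch(e,f,g) */
	uint32_t t1 = v[7] + S1 + ch + ki + wi;
	uint32_t S0 = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
	uint32_t maj = ((b ^ c) & (a ^ b)) ^ b;	   /* Maj via (b^c)&(a^b), as above */

	v[3] += t1;		/* d += T1      */
	v[7] = t1 + S0 + maj;	/* h  = T1 + T2 */
}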
+
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code 32
+# endif
+#endif
+
+.type K256,%object
+.align 5
+K256:
+.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size K256,.-K256
+.word 0 @ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align 5
+
+.global sha256_block_data_order
+.type sha256_block_data_order,%function
+sha256_block_data_order:
+.Lsha256_block_data_order:
+#if __ARM_ARCH__<7
+ sub r3,pc,#8 @ sha256_block_data_order
+#else
+ adr r3,.Lsha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#ARMV8_SHA256
+ bne .LARMv8
+ tst r12,#ARMV7_NEON
+ bne .LNEON
+#endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+ stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
+ ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
+ sub $Ktbl,r3,#256+32 @ K256
+ sub sp,sp,#16*4 @ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+ ldr $t1,[$inp],#4
+# else
+ ldrb $t1,[$inp,#3]
+# endif
+ eor $t3,$B,$C @ magic
+ eor $t2,$t2,$t2
+___
+for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=".Lrounds_16_xx:\n";
+for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+#if __ARM_ARCH__>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq $t3,[sp,#16*4] @ pull ctx
+ bne .Lrounds_16_xx
+
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
+ ldr $t0,[$t3,#0]
+ ldr $t1,[$t3,#4]
+ ldr $t2,[$t3,#8]
+ add $A,$A,$t0
+ ldr $t0,[$t3,#12]
+ add $B,$B,$t1
+ ldr $t1,[$t3,#16]
+ add $C,$C,$t2
+ ldr $t2,[$t3,#20]
+ add $D,$D,$t0
+ ldr $t0,[$t3,#24]
+ add $E,$E,$t1
+ ldr $t1,[$t3,#28]
+ add $F,$F,$t2
+ ldr $inp,[sp,#17*4] @ pull inp
+ ldr $t2,[sp,#18*4] @ pull inp+len
+ add $G,$G,$t0
+ add $H,$H,$t1
+ stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
+ cmp $inp,$t2
+ sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
+ bne .Loop
+
+ add sp,sp,#`16+3`*4 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size sha256_block_data_order,.-sha256_block_data_order
+___
+######################################################################
+# NEON stuff
+#
+{{{
+my @X=map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
+my $Xfer=$t4;
+my $j=0;
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+ my $arg = pop;
+ $arg = "#$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Xupdate()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T2,$T0,$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T1,$T0,$sigma0[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T2,$T0,32-$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T3,$T0,$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T1,$T1,$T2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T3,$T0,32-$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T1,$T1,$T3); # sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4); # sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4); # sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 ($T0,$T0,@X[0]);
+ while($#insns>=2) { eval(shift(@insns)); }
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vrev32_8 (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 ($T0,$T0,@X[0]);
+ foreach (@insns) { eval; } # remaining instructions
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
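
Xupdate above advances the schedule four words per call; each sigma is assembled from vshr/vsli pairs (a right shift plus a shift-left-and-insert makes a rotate), and because sigma1's input W[i-2] partly comes from the same 4-word group, that half runs in two 2-word halves on d registers (the Dlo/Dhi helpers). The scalar recurrence being vectorized, using the standard definitions with @sigma0=(7,18,3) and @sigma1=(17,19,10):

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n)
{
	return (x >> n) | (x << (32 - n));
}

/* W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16] */
static void sha256_schedule(uint32_t w[64])
{
	int i;

	for (i = 16; i < 64; i++) {
		uint32_t s0 = ror32(w[i-15], 7) ^ ror32(w[i-15], 18) ^
			      (w[i-15] >> 3);
		uint32_t s1 = ror32(w[i-2], 17) ^ ror32(w[i-2], 19) ^
			      (w[i-2] >> 10);

		w[i] = s1 + w[i-7] + s0 + w[i-16];
	}
}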
+
+sub body_00_15 () {
+ (
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+ '&add ($h,$h,$t1)', # h+=X[i]+K[i]
+ '&eor ($t1,$f,$g)',
+ '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+ '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
+ '&and ($t1,$t1,$e)',
+ '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
+ '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+ '&eor ($t1,$t1,$g)', # Ch(e,f,g)
+ '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
+ '&eor ($t2,$a,$b)', # a^b, b^c in next round
+ '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
+ '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
+ '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
+ '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
+ '&ldr ($t1,"[sp,#64]") if ($j==31)',
+ '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
+ '&add ($d,$d,$h)', # d+=h
+ '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
+ '&eor ($t3,$t3,$b)', # Maj(a,b,c)
+ '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+ )
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.global sha256_block_data_order_neon
+.type sha256_block_data_order_neon,%function
+.align 4
+sha256_block_data_order_neon:
+.LNEON:
+ stmdb sp!,{r4-r12,lr}
+
+ sub $H,sp,#16*4+16
+ adr $Ktbl,.Lsha256_block_data_order
+ sub $Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
+ bic $H,$H,#15 @ align for 128-bit stores
+ mov $t2,sp
+ mov sp,$H @ alloca
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+
+ vld1.8 {@X[0]},[$inp]!
+ vld1.8 {@X[1]},[$inp]!
+ vld1.8 {@X[2]},[$inp]!
+ vld1.8 {@X[3]},[$inp]!
+ vld1.32 {$T0},[$Ktbl,:128]!
+ vld1.32 {$T1},[$Ktbl,:128]!
+ vld1.32 {$T2},[$Ktbl,:128]!
+ vld1.32 {$T3},[$Ktbl,:128]!
+ vrev32.8 @X[0],@X[0] @ yes, even on
+ str $ctx,[sp,#64]
+ vrev32.8 @X[1],@X[1] @ big-endian
+ str $inp,[sp,#68]
+ mov $Xfer,sp
+ vrev32.8 @X[2],@X[2]
+ str $len,[sp,#72]
+ vrev32.8 @X[3],@X[3]
+ str $t2,[sp,#76] @ save original sp
+ vadd.i32 $T0,$T0,@X[0]
+ vadd.i32 $T1,$T1,@X[1]
+ vst1.32 {$T0},[$Xfer,:128]!
+ vadd.i32 $T2,$T2,@X[2]
+ vst1.32 {$T1},[$Xfer,:128]!
+ vadd.i32 $T3,$T3,@X[3]
+ vst1.32 {$T2},[$Xfer,:128]!
+ vst1.32 {$T3},[$Xfer,:128]!
+
+ ldmia $ctx,{$A-$H}
+ sub $Xfer,$Xfer,#64
+ ldr $t1,[sp,#0]
+ eor $t2,$t2,$t2
+ eor $t3,$B,$C
+ b .L_00_48
+
+.align 4
+.L_00_48:
+___
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+$code.=<<___;
+ teq $t1,#0 @ check for K256 terminator
+ ldr $t1,[sp,#0]
+ sub $Xfer,$Xfer,#64
+ bne .L_00_48
+
+ ldr $inp,[sp,#68]
+ ldr $t0,[sp,#72]
+ sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
+ teq $inp,$t0
+ it eq
+ subeq $inp,$inp,#64 @ avoid SEGV
+ vld1.8 {@X[0]},[$inp]! @ load next input block
+ vld1.8 {@X[1]},[$inp]!
+ vld1.8 {@X[2]},[$inp]!
+ vld1.8 {@X[3]},[$inp]!
+ it ne
+ strne $inp,[sp,#68]
+ mov $Xfer,sp
+___
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+$code.=<<___;
+ ldr $t0,[$t1,#0]
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
+ ldr $t2,[$t1,#4]
+ ldr $t3,[$t1,#8]
+ ldr $t4,[$t1,#12]
+ add $A,$A,$t0 @ accumulate
+ ldr $t0,[$t1,#16]
+ add $B,$B,$t2
+ ldr $t2,[$t1,#20]
+ add $C,$C,$t3
+ ldr $t3,[$t1,#24]
+ add $D,$D,$t4
+ ldr $t4,[$t1,#28]
+ add $E,$E,$t0
+ str $A,[$t1],#4
+ add $F,$F,$t2
+ str $B,[$t1],#4
+ add $G,$G,$t3
+ str $C,[$t1],#4
+ add $H,$H,$t4
+ str $D,[$t1],#4
+ stmia $t1,{$E-$H}
+
+ ittte ne
+ movne $Xfer,sp
+ ldrne $t1,[sp,#0]
+ eorne $t2,$t2,$t2
+ ldreq sp,[sp,#76] @ restore original sp
+ itt ne
+ eorne $t3,$B,$C
+ bne .L_00_48
+
+ ldmia sp!,{r4-r12,pc}
+.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+___
+}}}
+######################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
+my @MSG=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
+my $Ktbl="r3";
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
+.type sha256_block_data_order_armv8,%function
+.align 5
+sha256_block_data_order_armv8:
+.LARMv8:
+ vld1.32 {$ABCD,$EFGH},[$ctx]
+# ifdef __thumb2__
+ adr $Ktbl,.LARMv8
+ sub $Ktbl,$Ktbl,#.LARMv8-K256
+# else
+ adrl $Ktbl,K256
+# endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+
+.Loop_v8:
+ vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
+ vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
+ vld1.32 {$W0},[$Ktbl]!
+ vrev32.8 @MSG[0],@MSG[0]
+ vrev32.8 @MSG[1],@MSG[1]
+ vrev32.8 @MSG[2],@MSG[2]
+ vrev32.8 @MSG[3],@MSG[3]
+ vmov $ABCD_SAVE,$ABCD @ offload
+ vmov $EFGH_SAVE,$EFGH
+ teq $inp,$len
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+ vld1.32 {$W1},[$Ktbl]!
+ vadd.i32 $W0,$W0,@MSG[0]
+ sha256su0 @MSG[0],@MSG[1]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+ sha256su1 @MSG[0],@MSG[2],@MSG[3]
+___
+ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+ vld1.32 {$W1},[$Ktbl]!
+ vadd.i32 $W0,$W0,@MSG[0]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ vld1.32 {$W0},[$Ktbl]!
+ vadd.i32 $W1,$W1,@MSG[1]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ vld1.32 {$W1},[$Ktbl]
+ vadd.i32 $W0,$W0,@MSG[2]
+ sub $Ktbl,$Ktbl,#256-16 @ rewind
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ vadd.i32 $W1,$W1,@MSG[3]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
+ vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
+ it ne
+ bne .Loop_v8
+
+ vst1.32 {$ABCD,$EFGH},[$ctx]
+
+ ret @ bx lr
+.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm OPENSSL_armcap_P,4,4
+#endif
+___
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
+{ my %opcode = (
+ "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
+ "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
+
+ sub unsha256 {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<17)|(($2&8)<<4)
+ |(($3&7)<<1) |(($3&8)<<2);
+	# ARMv7 instructions are always encoded little-endian, so the
+	# raw bytes are emitted directly. The correct solution is to use
+	# the .inst directive, but older assemblers don't implement it:-(
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ }
+}
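
As a worked example of the encoding above: `sha256h q0,q1,q2` starts from
opcode 0xf3000c40, ORs in Vn=q1 (1<<17) and Vm=q2 (2<<1) to give 0xf3020c44,
and is emitted byte-reversed as INST(0x44,0x0c,0x02,0xf3).
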
+
+foreach (split($/,$code)) {
+
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
+
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/lib/crypto/arm/sha256-ce.S b/lib/crypto/arm/sha256-ce.S
new file mode 100644
index 000000000000..144ee805f64a
--- /dev/null
+++ b/lib/crypto/arm/sha256-ce.S
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SHA-224/256 secure hash using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2015 Linaro Ltd.
+ * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+ .arch armv8-a
+ .fpu crypto-neon-fp-armv8
+
+ k0 .req q7
+ k1 .req q8
+ rk .req r3
+
+ ta0 .req q9
+ ta1 .req q10
+ tb0 .req q10
+ tb1 .req q9
+
+ dga .req q11
+ dgb .req q12
+
+ dg0 .req q13
+ dg1 .req q14
+ dg2 .req q15
+
+ .macro add_only, ev, s0
+ vmov dg2, dg0
+ .ifnb \s0
+ vld1.32 {k\ev}, [rk, :128]!
+ .endif
+ sha256h.32 dg0, dg1, tb\ev
+ sha256h2.32 dg1, dg2, tb\ev
+ .ifnb \s0
+ vadd.u32 ta\ev, q\s0, k\ev
+ .endif
+ .endm
+
+ .macro add_update, ev, s0, s1, s2, s3
+ sha256su0.32 q\s0, q\s1
+ add_only \ev, \s1
+ sha256su1.32 q\s0, q\s2, q\s3
+ .endm
+
+ .align 6
+.Lsha256_rcon:
+ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+ /*
+ * void sha256_ce_transform(struct sha256_block_state *state,
+ * const u8 *data, size_t nblocks);
+ */
+ENTRY(sha256_ce_transform)
+ /* load state */
+ vld1.32 {dga-dgb}, [r0]
+
+ /* load input */
+0: vld1.32 {q0-q1}, [r1]!
+ vld1.32 {q2-q3}, [r1]!
+ subs r2, r2, #1
+
+#ifndef CONFIG_CPU_BIG_ENDIAN
+ vrev32.8 q0, q0
+ vrev32.8 q1, q1
+ vrev32.8 q2, q2
+ vrev32.8 q3, q3
+#endif
+
+ /* load first round constant */
+ adr rk, .Lsha256_rcon
+ vld1.32 {k0}, [rk, :128]!
+
+ vadd.u32 ta0, q0, k0
+ vmov dg0, dga
+ vmov dg1, dgb
+
+ add_update 1, 0, 1, 2, 3
+ add_update 0, 1, 2, 3, 0
+ add_update 1, 2, 3, 0, 1
+ add_update 0, 3, 0, 1, 2
+ add_update 1, 0, 1, 2, 3
+ add_update 0, 1, 2, 3, 0
+ add_update 1, 2, 3, 0, 1
+ add_update 0, 3, 0, 1, 2
+ add_update 1, 0, 1, 2, 3
+ add_update 0, 1, 2, 3, 0
+ add_update 1, 2, 3, 0, 1
+ add_update 0, 3, 0, 1, 2
+
+ add_only 1, 1
+ add_only 0, 2
+ add_only 1, 3
+ add_only 0
+
+ /* update state */
+ vadd.u32 dga, dga, dg0
+ vadd.u32 dgb, dgb, dg1
+ bne 0b
+
+ /* store new state */
+ vst1.32 {dga-dgb}, [r0]
+ bx lr
+ENDPROC(sha256_ce_transform)
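
The add_update/add_only macros above correspond one-for-one to the ACLE
SHA-256 intrinsics. A rough C sketch of a single add_update step, with the
ta/tb software pipelining flattened out (assumes a toolchain targeting the
crypto extension; this is not how the kernel builds the code):

	#include <arm_neon.h>

	static void sha256_ce_step(uint32x4_t *abcd, uint32x4_t *efgh,
				   uint32x4_t *w0, uint32x4_t w1,
				   uint32x4_t w2, uint32x4_t w3, uint32x4_t k)
	{
		uint32x4_t wk = vaddq_u32(*w0, k);	/* w + K for these rounds */
		uint32x4_t abcd_prev = *abcd;		/* vmov dg2, dg0 */

		*abcd = vsha256hq_u32(*abcd, *efgh, wk);	/* sha256h.32  */
		*efgh = vsha256h2q_u32(*efgh, abcd_prev, wk);	/* sha256h2.32 */
		*w0 = vsha256su0q_u32(*w0, w1);			/* sha256su0.32 */
		*w0 = vsha256su1q_u32(*w0, w2, w3);		/* sha256su1.32 */
	}
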
diff --git a/lib/crypto/arm/sha256.h b/lib/crypto/arm/sha256.h
new file mode 100644
index 000000000000..ae7e52dd6e3b
--- /dev/null
+++ b/lib/crypto/arm/sha256.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 optimized for ARM
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+asmlinkage void sha256_block_data_order(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks);
+asmlinkage void sha256_block_data_order_neon(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks);
+asmlinkage void sha256_ce_transform(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha256_blocks(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ static_branch_likely(&have_neon) && likely(may_use_simd())) {
+ scoped_ksimd() {
+ if (static_branch_likely(&have_ce))
+ sha256_ce_transform(state, data, nblocks);
+ else
+ sha256_block_data_order_neon(state, data, nblocks);
+ }
+ } else {
+ sha256_block_data_order(state, data, nblocks);
+ }
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define sha256_mod_init_arch sha256_mod_init_arch
+static void sha256_mod_init_arch(void)
+{
+ if (elf_hwcap & HWCAP_NEON) {
+ static_branch_enable(&have_neon);
+ if (elf_hwcap2 & HWCAP2_SHA2)
+ static_branch_enable(&have_ce);
+ }
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
diff --git a/lib/crypto/arm/sha512-armv4.pl b/lib/crypto/arm/sha512-armv4.pl
new file mode 100644
index 000000000000..2fc3516912fa
--- /dev/null
+++ b/lib/crypto/arm/sha512-armv4.pl
@@ -0,0 +1,657 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from the OpenSSL project but the author (Andy Polyakov)
+# has relicensed it under the GPLv2. Therefore this program is free software;
+# you can redistribute it and/or modify it under the terms of the GNU General
+# Public License version 2 as published by the Free Software Foundation.
+#
+# The original headers, including the original license headers, are
+# included below for completeness.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA512 block procedure for ARMv4. September 2007.
+
+# This code is ~4.5 (four and a half) times faster than code generated
+# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
+# Xscale PXA250 core].
+#
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 6% improvement on
+# Cortex A8 core and ~40 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Cortex-A8 core and ~38 cycles per byte.
+
+# March 2011.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process
+# one byte in 23.3 cycles or ~60% faster than integer-only code.
+
+# August 2012.
+#
+# Improve NEON performance by 12% on Snapdragon S4. In absolute
+# terms it's 22.6 cycles per byte, which is a disappointing result.
+# Technical writers asserted that the 3-way S4 pipeline can sustain
+# multiple NEON instructions per cycle, but dual NEON issue could
+# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html
+# for further details. On a side note, Cortex-A15 processes one byte in
+# 16 cycles.
+
+# Byte order [in]dependence. =========================================
+#
+# Originally the caller was expected to maintain a specific *dword*
+# order in h[0-7], namely with the most significant dword at the
+# *lower* address, which was reflected in the two parameters below
+# as 0 and 4. Now the caller is expected to maintain native byte
+# order for whole 64-bit values.
+$hi="HI";
+$lo="LO";
+# ====================================================================
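
In C terms the convention amounts to addressing each natively stored 64-bit
word through an endian-dependent byte offset; a minimal sketch (half_at() is
an illustrative helper, not kernel code):

	#include <stdint.h>
	#include <string.h>

	#define LO_OFF 0	/* 4 on big-endian, as #defined in the output */
	#define HI_OFF 4	/* 0 on big-endian */

	/* read one 32-bit half of a natively stored 64-bit value */
	static inline uint32_t half_at(const uint64_t *w, int off)
	{
		uint32_t v;

		memcpy(&v, (const char *)w + off, sizeof(v));
		return v;
	}
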
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0"; # parameter block
+$inp="r1";
+$len="r2";
+
+$Tlo="r3";
+$Thi="r4";
+$Alo="r5";
+$Ahi="r6";
+$Elo="r7";
+$Ehi="r8";
+$t0="r9";
+$t1="r10";
+$t2="r11";
+$t3="r12";
+############ r13 is stack pointer
+$Ktbl="r14";
+############ r15 is program counter
+
+$Aoff=8*0;
+$Boff=8*1;
+$Coff=8*2;
+$Doff=8*3;
+$Eoff=8*4;
+$Foff=8*5;
+$Goff=8*6;
+$Hoff=8*7;
+$Xoff=8*8;
+
+sub BODY_00_15() {
+my $magic = shift;
+$code.=<<___;
+ @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
+ @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+ @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+ mov $t0,$Elo,lsr#14
+ str $Tlo,[sp,#$Xoff+0]
+ mov $t1,$Ehi,lsr#14
+ str $Thi,[sp,#$Xoff+4]
+ eor $t0,$t0,$Ehi,lsl#18
+ ldr $t2,[sp,#$Hoff+0] @ h.lo
+ eor $t1,$t1,$Elo,lsl#18
+ ldr $t3,[sp,#$Hoff+4] @ h.hi
+ eor $t0,$t0,$Elo,lsr#18
+ eor $t1,$t1,$Ehi,lsr#18
+ eor $t0,$t0,$Ehi,lsl#14
+ eor $t1,$t1,$Elo,lsl#14
+ eor $t0,$t0,$Ehi,lsr#9
+ eor $t1,$t1,$Elo,lsr#9
+ eor $t0,$t0,$Elo,lsl#23
+ eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
+ adds $Tlo,$Tlo,$t0
+ ldr $t0,[sp,#$Foff+0] @ f.lo
+ adc $Thi,$Thi,$t1 @ T += Sigma1(e)
+ ldr $t1,[sp,#$Foff+4] @ f.hi
+ adds $Tlo,$Tlo,$t2
+ ldr $t2,[sp,#$Goff+0] @ g.lo
+ adc $Thi,$Thi,$t3 @ T += h
+ ldr $t3,[sp,#$Goff+4] @ g.hi
+
+ eor $t0,$t0,$t2
+ str $Elo,[sp,#$Eoff+0]
+ eor $t1,$t1,$t3
+ str $Ehi,[sp,#$Eoff+4]
+ and $t0,$t0,$Elo
+ str $Alo,[sp,#$Aoff+0]
+ and $t1,$t1,$Ehi
+ str $Ahi,[sp,#$Aoff+4]
+ eor $t0,$t0,$t2
+ ldr $t2,[$Ktbl,#$lo] @ K[i].lo
+ eor $t1,$t1,$t3 @ Ch(e,f,g)
+ ldr $t3,[$Ktbl,#$hi] @ K[i].hi
+
+ adds $Tlo,$Tlo,$t0
+ ldr $Elo,[sp,#$Doff+0] @ d.lo
+ adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
+ ldr $Ehi,[sp,#$Doff+4] @ d.hi
+ adds $Tlo,$Tlo,$t2
+ and $t0,$t2,#0xff
+ adc $Thi,$Thi,$t3 @ T += K[i]
+ adds $Elo,$Elo,$Tlo
+ ldr $t2,[sp,#$Boff+0] @ b.lo
+ adc $Ehi,$Ehi,$Thi @ d += T
+ teq $t0,#$magic
+
+ ldr $t3,[sp,#$Coff+0] @ c.lo
+#if __ARM_ARCH__>=7
+ it eq @ Thumb2 thing, sanity check in ARM
+#endif
+ orreq $Ktbl,$Ktbl,#1
+ @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+ @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+ @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+ mov $t0,$Alo,lsr#28
+ mov $t1,$Ahi,lsr#28
+ eor $t0,$t0,$Ahi,lsl#4
+ eor $t1,$t1,$Alo,lsl#4
+ eor $t0,$t0,$Ahi,lsr#2
+ eor $t1,$t1,$Alo,lsr#2
+ eor $t0,$t0,$Alo,lsl#30
+ eor $t1,$t1,$Ahi,lsl#30
+ eor $t0,$t0,$Ahi,lsr#7
+ eor $t1,$t1,$Alo,lsr#7
+ eor $t0,$t0,$Alo,lsl#25
+ eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
+ adds $Tlo,$Tlo,$t0
+ and $t0,$Alo,$t2
+ adc $Thi,$Thi,$t1 @ T += Sigma0(a)
+
+ ldr $t1,[sp,#$Boff+4] @ b.hi
+ orr $Alo,$Alo,$t2
+ ldr $t2,[sp,#$Coff+4] @ c.hi
+ and $Alo,$Alo,$t3
+ and $t3,$Ahi,$t1
+ orr $Ahi,$Ahi,$t1
+ orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
+ and $Ahi,$Ahi,$t2
+ adds $Alo,$Alo,$Tlo
+ orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
+ sub sp,sp,#8
+ adc $Ahi,$Ahi,$Thi @ h += T
+ tst $Ktbl,#1
+ add $Ktbl,$Ktbl,#8
+___
+}
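
The LO/HI comment pairs above spell out how each 64-bit rotation decomposes
into shifts of the 32-bit halves. As a sketch, with e_lo/e_hi standing for
the halves of e:

	/* low half of ROTR64(x, n) for 0 < n < 32, where x = hi:lo */
	static inline uint32_t rotr64_lo(uint32_t lo, uint32_t hi, unsigned int n)
	{
		return (lo >> n) | (hi << (32 - n));
	}

	/* Sigma1(e) = ROTR(e,14) ^ ROTR(e,18) ^ ROTR(e,41); because
	 * ROTR(x,41) == ROTR(x,32+9), the halves swap for the last term */
	uint32_t sigma1_lo = rotr64_lo(e_lo, e_hi, 14) ^
			     rotr64_lo(e_lo, e_hi, 18) ^
			     rotr64_lo(e_hi, e_lo, 9);
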
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code 32
+# endif
+#endif
+
+.type K512,%object
+.align 5
+K512:
+WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
+WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
+WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
+WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
+WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
+WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
+WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
+WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
+WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
+WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
+WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
+WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
+WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
+WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
+WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
+WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
+WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
+WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
+WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
+WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
+WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
+WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
+WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
+WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
+WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
+WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
+WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
+WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
+WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
+WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
+WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
+WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
+WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
+WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
+WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
+WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
+WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
+WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
+WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
+WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
+.size K512,.-K512
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha512_block_data_order
+.skip 32-4
+#else
+.skip 32
+#endif
+
+.global sha512_block_data_order
+.type sha512_block_data_order,%function
+sha512_block_data_order:
+.Lsha512_block_data_order:
+#if __ARM_ARCH__<7
+ sub r3,pc,#8 @ sha512_block_data_order
+#else
+ adr r3,.Lsha512_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#1
+ bne .LNEON
+#endif
+ add $len,$inp,$len,lsl#7 @ len to point at the end of inp
+ stmdb sp!,{r4-r12,lr}
+ sub $Ktbl,r3,#672 @ K512
+ sub sp,sp,#9*8
+
+ ldr $Elo,[$ctx,#$Eoff+$lo]
+ ldr $Ehi,[$ctx,#$Eoff+$hi]
+ ldr $t0, [$ctx,#$Goff+$lo]
+ ldr $t1, [$ctx,#$Goff+$hi]
+ ldr $t2, [$ctx,#$Hoff+$lo]
+ ldr $t3, [$ctx,#$Hoff+$hi]
+.Loop:
+ str $t0, [sp,#$Goff+0]
+ str $t1, [sp,#$Goff+4]
+ str $t2, [sp,#$Hoff+0]
+ str $t3, [sp,#$Hoff+4]
+ ldr $Alo,[$ctx,#$Aoff+$lo]
+ ldr $Ahi,[$ctx,#$Aoff+$hi]
+ ldr $Tlo,[$ctx,#$Boff+$lo]
+ ldr $Thi,[$ctx,#$Boff+$hi]
+ ldr $t0, [$ctx,#$Coff+$lo]
+ ldr $t1, [$ctx,#$Coff+$hi]
+ ldr $t2, [$ctx,#$Doff+$lo]
+ ldr $t3, [$ctx,#$Doff+$hi]
+ str $Tlo,[sp,#$Boff+0]
+ str $Thi,[sp,#$Boff+4]
+ str $t0, [sp,#$Coff+0]
+ str $t1, [sp,#$Coff+4]
+ str $t2, [sp,#$Doff+0]
+ str $t3, [sp,#$Doff+4]
+ ldr $Tlo,[$ctx,#$Foff+$lo]
+ ldr $Thi,[$ctx,#$Foff+$hi]
+ str $Tlo,[sp,#$Foff+0]
+ str $Thi,[sp,#$Foff+4]
+
+.L00_15:
+#if __ARM_ARCH__<7
+ ldrb $Tlo,[$inp,#7]
+ ldrb $t0, [$inp,#6]
+ ldrb $t1, [$inp,#5]
+ ldrb $t2, [$inp,#4]
+ ldrb $Thi,[$inp,#3]
+ ldrb $t3, [$inp,#2]
+ orr $Tlo,$Tlo,$t0,lsl#8
+ ldrb $t0, [$inp,#1]
+ orr $Tlo,$Tlo,$t1,lsl#16
+ ldrb $t1, [$inp],#8
+ orr $Tlo,$Tlo,$t2,lsl#24
+ orr $Thi,$Thi,$t3,lsl#8
+ orr $Thi,$Thi,$t0,lsl#16
+ orr $Thi,$Thi,$t1,lsl#24
+#else
+ ldr $Tlo,[$inp,#4]
+ ldr $Thi,[$inp],#8
+#ifdef __ARMEL__
+ rev $Tlo,$Tlo
+ rev $Thi,$Thi
+#endif
+#endif
+___
+ &BODY_00_15(0x94);
+$code.=<<___;
+ tst $Ktbl,#1
+ beq .L00_15
+ ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
+ ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
+ bic $Ktbl,$Ktbl,#1
+.L16_79:
+ @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
+ @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
+ @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
+ mov $Tlo,$t0,lsr#1
+ ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
+ mov $Thi,$t1,lsr#1
+ ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
+ eor $Tlo,$Tlo,$t1,lsl#31
+ eor $Thi,$Thi,$t0,lsl#31
+ eor $Tlo,$Tlo,$t0,lsr#8
+ eor $Thi,$Thi,$t1,lsr#8
+ eor $Tlo,$Tlo,$t1,lsl#24
+ eor $Thi,$Thi,$t0,lsl#24
+ eor $Tlo,$Tlo,$t0,lsr#7
+ eor $Thi,$Thi,$t1,lsr#7
+ eor $Tlo,$Tlo,$t1,lsl#25
+
+ @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
+ @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
+ @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
+ mov $t0,$t2,lsr#19
+ mov $t1,$t3,lsr#19
+ eor $t0,$t0,$t3,lsl#13
+ eor $t1,$t1,$t2,lsl#13
+ eor $t0,$t0,$t3,lsr#29
+ eor $t1,$t1,$t2,lsr#29
+ eor $t0,$t0,$t2,lsl#3
+ eor $t1,$t1,$t3,lsl#3
+ eor $t0,$t0,$t2,lsr#6
+ eor $t1,$t1,$t3,lsr#6
+ ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
+ eor $t0,$t0,$t3,lsl#26
+
+ ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
+ adds $Tlo,$Tlo,$t0
+ ldr $t0,[sp,#`$Xoff+8*16`+0]
+ adc $Thi,$Thi,$t1
+
+ ldr $t1,[sp,#`$Xoff+8*16`+4]
+ adds $Tlo,$Tlo,$t2
+ adc $Thi,$Thi,$t3
+ adds $Tlo,$Tlo,$t0
+ adc $Thi,$Thi,$t1
+___
+ &BODY_00_15(0x17);
+$code.=<<___;
+#if __ARM_ARCH__>=7
+ ittt eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
+ ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
+ beq .L16_79
+ bic $Ktbl,$Ktbl,#1
+
+ ldr $Tlo,[sp,#$Boff+0]
+ ldr $Thi,[sp,#$Boff+4]
+ ldr $t0, [$ctx,#$Aoff+$lo]
+ ldr $t1, [$ctx,#$Aoff+$hi]
+ ldr $t2, [$ctx,#$Boff+$lo]
+ ldr $t3, [$ctx,#$Boff+$hi]
+ adds $t0,$Alo,$t0
+ str $t0, [$ctx,#$Aoff+$lo]
+ adc $t1,$Ahi,$t1
+ str $t1, [$ctx,#$Aoff+$hi]
+ adds $t2,$Tlo,$t2
+ str $t2, [$ctx,#$Boff+$lo]
+ adc $t3,$Thi,$t3
+ str $t3, [$ctx,#$Boff+$hi]
+
+ ldr $Alo,[sp,#$Coff+0]
+ ldr $Ahi,[sp,#$Coff+4]
+ ldr $Tlo,[sp,#$Doff+0]
+ ldr $Thi,[sp,#$Doff+4]
+ ldr $t0, [$ctx,#$Coff+$lo]
+ ldr $t1, [$ctx,#$Coff+$hi]
+ ldr $t2, [$ctx,#$Doff+$lo]
+ ldr $t3, [$ctx,#$Doff+$hi]
+ adds $t0,$Alo,$t0
+ str $t0, [$ctx,#$Coff+$lo]
+ adc $t1,$Ahi,$t1
+ str $t1, [$ctx,#$Coff+$hi]
+ adds $t2,$Tlo,$t2
+ str $t2, [$ctx,#$Doff+$lo]
+ adc $t3,$Thi,$t3
+ str $t3, [$ctx,#$Doff+$hi]
+
+ ldr $Tlo,[sp,#$Foff+0]
+ ldr $Thi,[sp,#$Foff+4]
+ ldr $t0, [$ctx,#$Eoff+$lo]
+ ldr $t1, [$ctx,#$Eoff+$hi]
+ ldr $t2, [$ctx,#$Foff+$lo]
+ ldr $t3, [$ctx,#$Foff+$hi]
+ adds $Elo,$Elo,$t0
+ str $Elo,[$ctx,#$Eoff+$lo]
+ adc $Ehi,$Ehi,$t1
+ str $Ehi,[$ctx,#$Eoff+$hi]
+ adds $t2,$Tlo,$t2
+ str $t2, [$ctx,#$Foff+$lo]
+ adc $t3,$Thi,$t3
+ str $t3, [$ctx,#$Foff+$hi]
+
+ ldr $Alo,[sp,#$Goff+0]
+ ldr $Ahi,[sp,#$Goff+4]
+ ldr $Tlo,[sp,#$Hoff+0]
+ ldr $Thi,[sp,#$Hoff+4]
+ ldr $t0, [$ctx,#$Goff+$lo]
+ ldr $t1, [$ctx,#$Goff+$hi]
+ ldr $t2, [$ctx,#$Hoff+$lo]
+ ldr $t3, [$ctx,#$Hoff+$hi]
+ adds $t0,$Alo,$t0
+ str $t0, [$ctx,#$Goff+$lo]
+ adc $t1,$Ahi,$t1
+ str $t1, [$ctx,#$Goff+$hi]
+ adds $t2,$Tlo,$t2
+ str $t2, [$ctx,#$Hoff+$lo]
+ adc $t3,$Thi,$t3
+ str $t3, [$ctx,#$Hoff+$hi]
+
+ add sp,sp,#640
+ sub $Ktbl,$Ktbl,#640
+
+ teq $inp,$len
+ bne .Loop
+
+ add sp,sp,#8*9 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
+ ldmia sp!,{r4-r12,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size sha512_block_data_order,.-sha512_block_data_order
+___
+
+{
+my @Sigma0=(28,34,39);
+my @Sigma1=(14,18,41);
+my @sigma0=(1, 8, 7);
+my @sigma1=(19,61,6);
+
+my $Ktbl="r3";
+my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
+
+my @X=map("d$_",(0..15));
+my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
+
+sub NEON_00_15() {
+my $i=shift;
+my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
+
+$code.=<<___ if ($i<16 || $i&1);
+ vshr.u64 $t0,$e,#@Sigma1[0] @ $i
+#if $i<16
+ vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
+#endif
+ vshr.u64 $t1,$e,#@Sigma1[1]
+#if $i>0
+ vadd.i64 $a,$Maj @ h+=Maj from the past
+#endif
+ vshr.u64 $t2,$e,#@Sigma1[2]
+___
+$code.=<<___;
+ vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
+ vsli.64 $t0,$e,#`64-@Sigma1[0]`
+ vsli.64 $t1,$e,#`64-@Sigma1[1]`
+ vmov $Ch,$e
+ vsli.64 $t2,$e,#`64-@Sigma1[2]`
+#if $i<16 && defined(__ARMEL__)
+ vrev64.8 @X[$i],@X[$i]
+#endif
+ veor $t1,$t0
+ vbsl $Ch,$f,$g @ Ch(e,f,g)
+ vshr.u64 $t0,$a,#@Sigma0[0]
+ veor $t2,$t1 @ Sigma1(e)
+ vadd.i64 $T1,$Ch,$h
+ vshr.u64 $t1,$a,#@Sigma0[1]
+ vsli.64 $t0,$a,#`64-@Sigma0[0]`
+ vadd.i64 $T1,$t2
+ vshr.u64 $t2,$a,#@Sigma0[2]
+ vadd.i64 $K,@X[$i%16]
+ vsli.64 $t1,$a,#`64-@Sigma0[1]`
+ veor $Maj,$a,$b
+ vsli.64 $t2,$a,#`64-@Sigma0[2]`
+ veor $h,$t0,$t1
+ vadd.i64 $T1,$K
+ vbsl $Maj,$c,$b @ Maj(a,b,c)
+ veor $h,$t2 @ Sigma0(a)
+ vadd.i64 $d,$T1
+ vadd.i64 $Maj,$T1
+ @ vadd.i64 $h,$Maj
+___
+}
+
+sub NEON_16_79() {
+my $i=shift;
+
+if ($i&1) { &NEON_00_15($i,@_); return; }
+
+# 2x-vectorized, therefore runs every 2nd round
+my @X=map("q$_",(0..7)); # view @X as 128-bit vector
+my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
+my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
+my $e=@_[4]; # $e from NEON_00_15
+$i /= 2;
+$code.=<<___;
+ vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
+ vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
+ vadd.i64 @_[0],d30 @ h+=Maj from the past
+ vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
+ vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
+ vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
+ vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
+ veor $s1,$t0
+ vshr.u64 $t0,$s0,#@sigma0[0]
+ veor $s1,$t1 @ sigma1(X[i+14])
+ vshr.u64 $t1,$s0,#@sigma0[1]
+ vadd.i64 @X[$i%8],$s1
+ vshr.u64 $s1,$s0,#@sigma0[2]
+ vsli.64 $t0,$s0,#`64-@sigma0[0]`
+ vsli.64 $t1,$s0,#`64-@sigma0[1]`
+ vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
+ veor $s1,$t0
+ vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
+ vadd.i64 @X[$i%8],$s0
+ vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
+ veor $s1,$t1 @ sigma0(X[i+1])
+ vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
+ vadd.i64 @X[$i%8],$s1
+___
+ &NEON_00_15(2*$i,@_);
+}
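
NEON_16_79 vectorizes the SHA-512 message schedule two 64-bit words per call,
interleaved with the NEON_00_15 round instructions. The scalar recurrence it
computes, as a sketch (ror64() being the usual 64-bit rotate-right):

	/* sigma0 = (1, 8, 7), sigma1 = (19, 61, 6), as set above */
	W[i] = W[i - 16]
	     + (ror64(W[i - 15], 1) ^ ror64(W[i - 15], 8) ^ (W[i - 15] >> 7))
	     + W[i - 7]
	     + (ror64(W[i - 2], 19) ^ ror64(W[i - 2], 61) ^ (W[i - 2] >> 6));
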
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.global sha512_block_data_order_neon
+.type sha512_block_data_order_neon,%function
+.align 4
+sha512_block_data_order_neon:
+.LNEON:
+ dmb @ errata #451034 on early Cortex A8
+ add $len,$inp,$len,lsl#7 @ len to point at the end of inp
+ VFP_ABI_PUSH
+ adr $Ktbl,.Lsha512_block_data_order
+ sub $Ktbl,$Ktbl,.Lsha512_block_data_order-K512
+ vldmia $ctx,{$A-$H} @ load context
+.Loop_neon:
+___
+for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ mov $cnt,#4
+.L16_79_neon:
+ subs $cnt,#1
+___
+for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ bne .L16_79_neon
+
+ vadd.i64 $A,d30 @ h+=Maj from the past
+ vldmia $ctx,{d24-d31} @ load context to temp
+ vadd.i64 q8,q12 @ vectorized accumulate
+ vadd.i64 q9,q13
+ vadd.i64 q10,q14
+ vadd.i64 q11,q15
+ vstmia $ctx,{$A-$H} @ save context
+ teq $inp,$len
+ sub $Ktbl,#640 @ rewind K512
+ bne .Loop_neon
+
+ VFP_ABI_POP
+ ret @ bx lr
+.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm OPENSSL_armcap_P,4,4
+#endif
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx lr/gm;
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
+print $code;
+close STDOUT; # enforce flush
diff --git a/lib/crypto/arm/sha512.h b/lib/crypto/arm/sha512.h
new file mode 100644
index 000000000000..ed9bd81d6d78
--- /dev/null
+++ b/lib/crypto/arm/sha512.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * arm32-optimized SHA-512 block function
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+asmlinkage void sha512_block_data_order(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks);
+asmlinkage void sha512_block_data_order_neon(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha512_blocks(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ static_branch_likely(&have_neon) && likely(may_use_simd())) {
+ scoped_ksimd()
+ sha512_block_data_order_neon(state, data, nblocks);
+ } else {
+ sha512_block_data_order(state, data, nblocks);
+ }
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define sha512_mod_init_arch sha512_mod_init_arch
+static void sha512_mod_init_arch(void)
+{
+ if (cpu_has_neon())
+ static_branch_enable(&have_neon);
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
diff --git a/lib/crypto/arm64/.gitignore b/lib/crypto/arm64/.gitignore
new file mode 100644
index 000000000000..f6c4e8ef80da
--- /dev/null
+++ b/lib/crypto/arm64/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+poly1305-core.S
+sha256-core.S
+sha512-core.S
diff --git a/lib/crypto/arm64/chacha-neon-core.S b/lib/crypto/arm64/chacha-neon-core.S
new file mode 100644
index 000000000000..80079586ecc7
--- /dev/null
+++ b/lib/crypto/arm64/chacha-neon-core.S
@@ -0,0 +1,805 @@
+/*
+ * ChaCha/HChaCha NEON helper functions
+ *
+ * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Originally based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/cache.h>
+
+ .text
+ .align 6
+
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers v0-v3. It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * The round count is given in w3.
+ *
+ * Clobbers: w3, x10, v4, v12
+ */
+SYM_FUNC_START_LOCAL(chacha_permute)
+
+ adr_l x10, ROT8
+ ld1 {v12.4s}, [x10]
+
+.Ldoubleround:
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ add v0.4s, v0.4s, v1.4s
+ eor v3.16b, v3.16b, v0.16b
+ rev32 v3.8h, v3.8h
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ add v2.4s, v2.4s, v3.4s
+ eor v4.16b, v1.16b, v2.16b
+ shl v1.4s, v4.4s, #12
+ sri v1.4s, v4.4s, #20
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ add v0.4s, v0.4s, v1.4s
+ eor v3.16b, v3.16b, v0.16b
+ tbl v3.16b, {v3.16b}, v12.16b
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ add v2.4s, v2.4s, v3.4s
+ eor v4.16b, v1.16b, v2.16b
+ shl v1.4s, v4.4s, #7
+ sri v1.4s, v4.4s, #25
+
+ // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ ext v1.16b, v1.16b, v1.16b, #4
+ // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ ext v2.16b, v2.16b, v2.16b, #8
+ // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ ext v3.16b, v3.16b, v3.16b, #12
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ add v0.4s, v0.4s, v1.4s
+ eor v3.16b, v3.16b, v0.16b
+ rev32 v3.8h, v3.8h
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ add v2.4s, v2.4s, v3.4s
+ eor v4.16b, v1.16b, v2.16b
+ shl v1.4s, v4.4s, #12
+ sri v1.4s, v4.4s, #20
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ add v0.4s, v0.4s, v1.4s
+ eor v3.16b, v3.16b, v0.16b
+ tbl v3.16b, {v3.16b}, v12.16b
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ add v2.4s, v2.4s, v3.4s
+ eor v4.16b, v1.16b, v2.16b
+ shl v1.4s, v4.4s, #7
+ sri v1.4s, v4.4s, #25
+
+ // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ ext v1.16b, v1.16b, v1.16b, #12
+ // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ ext v2.16b, v2.16b, v2.16b, #8
+ // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ ext v3.16b, v3.16b, v3.16b, #4
+
+ subs w3, w3, #2
+ b.ne .Ldoubleround
+
+ ret
+SYM_FUNC_END(chacha_permute)
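
One pass of .Ldoubleround above is the standard ChaCha double round applied
to whole rows at once: v0..v3 hold x[0..3], x[4..7], x[8..11] and x[12..15],
so each vector add/eor/rotate acts on all four columns in parallel, and the
ext instructions rotate the rows between the column and diagonal halves. A
scalar C sketch (rol32() as in linux/bitops.h):

	#define QR(a, b, c, d) do {				\
		x[a] += x[b]; x[d] = rol32(x[d] ^ x[a], 16);	\
		x[c] += x[d]; x[b] = rol32(x[b] ^ x[c], 12);	\
		x[a] += x[b]; x[d] = rol32(x[d] ^ x[a],  8);	\
		x[c] += x[d]; x[b] = rol32(x[b] ^ x[c],  7);	\
	} while (0)

	/* column round */
	QR(0, 4,  8, 12); QR(1, 5,  9, 13);
	QR(2, 6, 10, 14); QR(3, 7, 11, 15);
	/* diagonal round */
	QR(0, 5, 10, 15); QR(1, 6, 11, 12);
	QR(2, 7,  8, 13); QR(3, 4,  9, 14);
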
+
+SYM_FUNC_START(chacha_block_xor_neon)
+ // x0: Input state matrix, s
+ // x1: 1 data block output, o
+ // x2: 1 data block input, i
+ // w3: nrounds
+
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ // x0..3 = s0..3
+ ld1 {v0.4s-v3.4s}, [x0]
+ ld1 {v8.4s-v11.4s}, [x0]
+
+ bl chacha_permute
+
+ ld1 {v4.16b-v7.16b}, [x2]
+
+ // o0 = i0 ^ (x0 + s0)
+ add v0.4s, v0.4s, v8.4s
+ eor v0.16b, v0.16b, v4.16b
+
+ // o1 = i1 ^ (x1 + s1)
+ add v1.4s, v1.4s, v9.4s
+ eor v1.16b, v1.16b, v5.16b
+
+ // o2 = i2 ^ (x2 + s2)
+ add v2.4s, v2.4s, v10.4s
+ eor v2.16b, v2.16b, v6.16b
+
+ // o3 = i3 ^ (x3 + s3)
+ add v3.4s, v3.4s, v11.4s
+ eor v3.16b, v3.16b, v7.16b
+
+ st1 {v0.16b-v3.16b}, [x1]
+
+ ldp x29, x30, [sp], #16
+ ret
+SYM_FUNC_END(chacha_block_xor_neon)
+
+SYM_FUNC_START(hchacha_block_neon)
+ // x0: Input state matrix, s
+ // x1: output (8 32-bit words)
+ // w2: nrounds
+
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ ld1 {v0.4s-v3.4s}, [x0]
+
+ mov w3, w2
+ bl chacha_permute
+
+ st1 {v0.4s}, [x1], #16
+ st1 {v3.4s}, [x1]
+
+ ldp x29, x30, [sp], #16
+ ret
+SYM_FUNC_END(hchacha_block_neon)
+
+ a0 .req w12
+ a1 .req w13
+ a2 .req w14
+ a3 .req w15
+ a4 .req w16
+ a5 .req w17
+ a6 .req w19
+ a7 .req w20
+ a8 .req w21
+ a9 .req w22
+ a10 .req w23
+ a11 .req w24
+ a12 .req w25
+ a13 .req w26
+ a14 .req w27
+ a15 .req w28
+
+ .align 6
+SYM_FUNC_START(chacha_4block_xor_neon)
+ frame_push 10
+
+ // x0: Input state matrix, s
+ // x1: 4 data blocks output, o
+ // x2: 4 data blocks input, i
+ // w3: nrounds
+ // x4: byte count
+
+ adr_l x10, .Lpermute
+ and x5, x4, #63
+ add x10, x10, x5
+
+	//
+	// This function encrypts four consecutive ChaCha blocks by loading
+	// the state matrix into NEON registers four times. The algorithm
+	// performs each operation on the corresponding word of each state
+	// matrix, and hence requires no word shuffling. For the final XOR
+	// step we transpose the matrix by interleaving 32-bit and then
+	// 64-bit words, which allows us to do the XOR in NEON registers.
+	//
+	// At the same time, a fifth block is encrypted in parallel using
+	// scalar registers.
+	//
+ adr_l x9, CTRINC // ... and ROT8
+ ld1 {v30.4s-v31.4s}, [x9]
+
+ // x0..15[0-3] = s0..3[0..3]
+ add x8, x0, #16
+ ld4r { v0.4s- v3.4s}, [x0]
+ ld4r { v4.4s- v7.4s}, [x8], #16
+ ld4r { v8.4s-v11.4s}, [x8], #16
+ ld4r {v12.4s-v15.4s}, [x8]
+
+ mov a0, v0.s[0]
+ mov a1, v1.s[0]
+ mov a2, v2.s[0]
+ mov a3, v3.s[0]
+ mov a4, v4.s[0]
+ mov a5, v5.s[0]
+ mov a6, v6.s[0]
+ mov a7, v7.s[0]
+ mov a8, v8.s[0]
+ mov a9, v9.s[0]
+ mov a10, v10.s[0]
+ mov a11, v11.s[0]
+ mov a12, v12.s[0]
+ mov a13, v13.s[0]
+ mov a14, v14.s[0]
+ mov a15, v15.s[0]
+
+ // x12 += counter values 1-4
+ add v12.4s, v12.4s, v30.4s
+
+.Ldoubleround4:
+ // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ add v0.4s, v0.4s, v4.4s
+ add a0, a0, a4
+ add v1.4s, v1.4s, v5.4s
+ add a1, a1, a5
+ add v2.4s, v2.4s, v6.4s
+ add a2, a2, a6
+ add v3.4s, v3.4s, v7.4s
+ add a3, a3, a7
+
+ eor v12.16b, v12.16b, v0.16b
+ eor a12, a12, a0
+ eor v13.16b, v13.16b, v1.16b
+ eor a13, a13, a1
+ eor v14.16b, v14.16b, v2.16b
+ eor a14, a14, a2
+ eor v15.16b, v15.16b, v3.16b
+ eor a15, a15, a3
+
+ rev32 v12.8h, v12.8h
+ ror a12, a12, #16
+ rev32 v13.8h, v13.8h
+ ror a13, a13, #16
+ rev32 v14.8h, v14.8h
+ ror a14, a14, #16
+ rev32 v15.8h, v15.8h
+ ror a15, a15, #16
+
+ // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ add v8.4s, v8.4s, v12.4s
+ add a8, a8, a12
+ add v9.4s, v9.4s, v13.4s
+ add a9, a9, a13
+ add v10.4s, v10.4s, v14.4s
+ add a10, a10, a14
+ add v11.4s, v11.4s, v15.4s
+ add a11, a11, a15
+
+ eor v16.16b, v4.16b, v8.16b
+ eor a4, a4, a8
+ eor v17.16b, v5.16b, v9.16b
+ eor a5, a5, a9
+ eor v18.16b, v6.16b, v10.16b
+ eor a6, a6, a10
+ eor v19.16b, v7.16b, v11.16b
+ eor a7, a7, a11
+
+ shl v4.4s, v16.4s, #12
+ shl v5.4s, v17.4s, #12
+ shl v6.4s, v18.4s, #12
+ shl v7.4s, v19.4s, #12
+
+ sri v4.4s, v16.4s, #20
+ ror a4, a4, #20
+ sri v5.4s, v17.4s, #20
+ ror a5, a5, #20
+ sri v6.4s, v18.4s, #20
+ ror a6, a6, #20
+ sri v7.4s, v19.4s, #20
+ ror a7, a7, #20
+
+ // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ add v0.4s, v0.4s, v4.4s
+ add a0, a0, a4
+ add v1.4s, v1.4s, v5.4s
+ add a1, a1, a5
+ add v2.4s, v2.4s, v6.4s
+ add a2, a2, a6
+ add v3.4s, v3.4s, v7.4s
+ add a3, a3, a7
+
+ eor v12.16b, v12.16b, v0.16b
+ eor a12, a12, a0
+ eor v13.16b, v13.16b, v1.16b
+ eor a13, a13, a1
+ eor v14.16b, v14.16b, v2.16b
+ eor a14, a14, a2
+ eor v15.16b, v15.16b, v3.16b
+ eor a15, a15, a3
+
+ tbl v12.16b, {v12.16b}, v31.16b
+ ror a12, a12, #24
+ tbl v13.16b, {v13.16b}, v31.16b
+ ror a13, a13, #24
+ tbl v14.16b, {v14.16b}, v31.16b
+ ror a14, a14, #24
+ tbl v15.16b, {v15.16b}, v31.16b
+ ror a15, a15, #24
+
+ // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ add v8.4s, v8.4s, v12.4s
+ add a8, a8, a12
+ add v9.4s, v9.4s, v13.4s
+ add a9, a9, a13
+ add v10.4s, v10.4s, v14.4s
+ add a10, a10, a14
+ add v11.4s, v11.4s, v15.4s
+ add a11, a11, a15
+
+ eor v16.16b, v4.16b, v8.16b
+ eor a4, a4, a8
+ eor v17.16b, v5.16b, v9.16b
+ eor a5, a5, a9
+ eor v18.16b, v6.16b, v10.16b
+ eor a6, a6, a10
+ eor v19.16b, v7.16b, v11.16b
+ eor a7, a7, a11
+
+ shl v4.4s, v16.4s, #7
+ shl v5.4s, v17.4s, #7
+ shl v6.4s, v18.4s, #7
+ shl v7.4s, v19.4s, #7
+
+ sri v4.4s, v16.4s, #25
+ ror a4, a4, #25
+ sri v5.4s, v17.4s, #25
+ ror a5, a5, #25
+ sri v6.4s, v18.4s, #25
+ ror a6, a6, #25
+ sri v7.4s, v19.4s, #25
+ ror a7, a7, #25
+
+ // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ add v0.4s, v0.4s, v5.4s
+ add a0, a0, a5
+ add v1.4s, v1.4s, v6.4s
+ add a1, a1, a6
+ add v2.4s, v2.4s, v7.4s
+ add a2, a2, a7
+ add v3.4s, v3.4s, v4.4s
+ add a3, a3, a4
+
+ eor v15.16b, v15.16b, v0.16b
+ eor a15, a15, a0
+ eor v12.16b, v12.16b, v1.16b
+ eor a12, a12, a1
+ eor v13.16b, v13.16b, v2.16b
+ eor a13, a13, a2
+ eor v14.16b, v14.16b, v3.16b
+ eor a14, a14, a3
+
+ rev32 v15.8h, v15.8h
+ ror a15, a15, #16
+ rev32 v12.8h, v12.8h
+ ror a12, a12, #16
+ rev32 v13.8h, v13.8h
+ ror a13, a13, #16
+ rev32 v14.8h, v14.8h
+ ror a14, a14, #16
+
+ // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ add v10.4s, v10.4s, v15.4s
+ add a10, a10, a15
+ add v11.4s, v11.4s, v12.4s
+ add a11, a11, a12
+ add v8.4s, v8.4s, v13.4s
+ add a8, a8, a13
+ add v9.4s, v9.4s, v14.4s
+ add a9, a9, a14
+
+ eor v16.16b, v5.16b, v10.16b
+ eor a5, a5, a10
+ eor v17.16b, v6.16b, v11.16b
+ eor a6, a6, a11
+ eor v18.16b, v7.16b, v8.16b
+ eor a7, a7, a8
+ eor v19.16b, v4.16b, v9.16b
+ eor a4, a4, a9
+
+ shl v5.4s, v16.4s, #12
+ shl v6.4s, v17.4s, #12
+ shl v7.4s, v18.4s, #12
+ shl v4.4s, v19.4s, #12
+
+ sri v5.4s, v16.4s, #20
+ ror a5, a5, #20
+ sri v6.4s, v17.4s, #20
+ ror a6, a6, #20
+ sri v7.4s, v18.4s, #20
+ ror a7, a7, #20
+ sri v4.4s, v19.4s, #20
+ ror a4, a4, #20
+
+ // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ add v0.4s, v0.4s, v5.4s
+ add a0, a0, a5
+ add v1.4s, v1.4s, v6.4s
+ add a1, a1, a6
+ add v2.4s, v2.4s, v7.4s
+ add a2, a2, a7
+ add v3.4s, v3.4s, v4.4s
+ add a3, a3, a4
+
+ eor v15.16b, v15.16b, v0.16b
+ eor a15, a15, a0
+ eor v12.16b, v12.16b, v1.16b
+ eor a12, a12, a1
+ eor v13.16b, v13.16b, v2.16b
+ eor a13, a13, a2
+ eor v14.16b, v14.16b, v3.16b
+ eor a14, a14, a3
+
+ tbl v15.16b, {v15.16b}, v31.16b
+ ror a15, a15, #24
+ tbl v12.16b, {v12.16b}, v31.16b
+ ror a12, a12, #24
+ tbl v13.16b, {v13.16b}, v31.16b
+ ror a13, a13, #24
+ tbl v14.16b, {v14.16b}, v31.16b
+ ror a14, a14, #24
+
+ // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ add v10.4s, v10.4s, v15.4s
+ add a10, a10, a15
+ add v11.4s, v11.4s, v12.4s
+ add a11, a11, a12
+ add v8.4s, v8.4s, v13.4s
+ add a8, a8, a13
+ add v9.4s, v9.4s, v14.4s
+ add a9, a9, a14
+
+ eor v16.16b, v5.16b, v10.16b
+ eor a5, a5, a10
+ eor v17.16b, v6.16b, v11.16b
+ eor a6, a6, a11
+ eor v18.16b, v7.16b, v8.16b
+ eor a7, a7, a8
+ eor v19.16b, v4.16b, v9.16b
+ eor a4, a4, a9
+
+ shl v5.4s, v16.4s, #7
+ shl v6.4s, v17.4s, #7
+ shl v7.4s, v18.4s, #7
+ shl v4.4s, v19.4s, #7
+
+ sri v5.4s, v16.4s, #25
+ ror a5, a5, #25
+ sri v6.4s, v17.4s, #25
+ ror a6, a6, #25
+ sri v7.4s, v18.4s, #25
+ ror a7, a7, #25
+ sri v4.4s, v19.4s, #25
+ ror a4, a4, #25
+
+ subs w3, w3, #2
+ b.ne .Ldoubleround4
+
+ ld4r {v16.4s-v19.4s}, [x0], #16
+ ld4r {v20.4s-v23.4s}, [x0], #16
+
+ // x12 += counter values 0-3
+ add v12.4s, v12.4s, v30.4s
+
+ // x0[0-3] += s0[0]
+ // x1[0-3] += s0[1]
+ // x2[0-3] += s0[2]
+ // x3[0-3] += s0[3]
+ add v0.4s, v0.4s, v16.4s
+ mov w6, v16.s[0]
+ mov w7, v17.s[0]
+ add v1.4s, v1.4s, v17.4s
+ mov w8, v18.s[0]
+ mov w9, v19.s[0]
+ add v2.4s, v2.4s, v18.4s
+ add a0, a0, w6
+ add a1, a1, w7
+ add v3.4s, v3.4s, v19.4s
+ add a2, a2, w8
+ add a3, a3, w9
+CPU_BE( rev a0, a0 )
+CPU_BE( rev a1, a1 )
+CPU_BE( rev a2, a2 )
+CPU_BE( rev a3, a3 )
+
+ ld4r {v24.4s-v27.4s}, [x0], #16
+ ld4r {v28.4s-v31.4s}, [x0]
+
+ // x4[0-3] += s1[0]
+ // x5[0-3] += s1[1]
+ // x6[0-3] += s1[2]
+ // x7[0-3] += s1[3]
+ add v4.4s, v4.4s, v20.4s
+ mov w6, v20.s[0]
+ mov w7, v21.s[0]
+ add v5.4s, v5.4s, v21.4s
+ mov w8, v22.s[0]
+ mov w9, v23.s[0]
+ add v6.4s, v6.4s, v22.4s
+ add a4, a4, w6
+ add a5, a5, w7
+ add v7.4s, v7.4s, v23.4s
+ add a6, a6, w8
+ add a7, a7, w9
+CPU_BE( rev a4, a4 )
+CPU_BE( rev a5, a5 )
+CPU_BE( rev a6, a6 )
+CPU_BE( rev a7, a7 )
+
+ // x8[0-3] += s2[0]
+ // x9[0-3] += s2[1]
+ // x10[0-3] += s2[2]
+ // x11[0-3] += s2[3]
+ add v8.4s, v8.4s, v24.4s
+ mov w6, v24.s[0]
+ mov w7, v25.s[0]
+ add v9.4s, v9.4s, v25.4s
+ mov w8, v26.s[0]
+ mov w9, v27.s[0]
+ add v10.4s, v10.4s, v26.4s
+ add a8, a8, w6
+ add a9, a9, w7
+ add v11.4s, v11.4s, v27.4s
+ add a10, a10, w8
+ add a11, a11, w9
+CPU_BE( rev a8, a8 )
+CPU_BE( rev a9, a9 )
+CPU_BE( rev a10, a10 )
+CPU_BE( rev a11, a11 )
+
+ // x12[0-3] += s3[0]
+ // x13[0-3] += s3[1]
+ // x14[0-3] += s3[2]
+ // x15[0-3] += s3[3]
+ add v12.4s, v12.4s, v28.4s
+ mov w6, v28.s[0]
+ mov w7, v29.s[0]
+ add v13.4s, v13.4s, v29.4s
+ mov w8, v30.s[0]
+ mov w9, v31.s[0]
+ add v14.4s, v14.4s, v30.4s
+ add a12, a12, w6
+ add a13, a13, w7
+ add v15.4s, v15.4s, v31.4s
+ add a14, a14, w8
+ add a15, a15, w9
+CPU_BE( rev a12, a12 )
+CPU_BE( rev a13, a13 )
+CPU_BE( rev a14, a14 )
+CPU_BE( rev a15, a15 )
+
+ // interleave 32-bit words in state n, n+1
+ ldp w6, w7, [x2], #64
+ zip1 v16.4s, v0.4s, v1.4s
+ ldp w8, w9, [x2, #-56]
+ eor a0, a0, w6
+ zip2 v17.4s, v0.4s, v1.4s
+ eor a1, a1, w7
+ zip1 v18.4s, v2.4s, v3.4s
+ eor a2, a2, w8
+ zip2 v19.4s, v2.4s, v3.4s
+ eor a3, a3, w9
+ ldp w6, w7, [x2, #-48]
+ zip1 v20.4s, v4.4s, v5.4s
+ ldp w8, w9, [x2, #-40]
+ eor a4, a4, w6
+ zip2 v21.4s, v4.4s, v5.4s
+ eor a5, a5, w7
+ zip1 v22.4s, v6.4s, v7.4s
+ eor a6, a6, w8
+ zip2 v23.4s, v6.4s, v7.4s
+ eor a7, a7, w9
+ ldp w6, w7, [x2, #-32]
+ zip1 v24.4s, v8.4s, v9.4s
+ ldp w8, w9, [x2, #-24]
+ eor a8, a8, w6
+ zip2 v25.4s, v8.4s, v9.4s
+ eor a9, a9, w7
+ zip1 v26.4s, v10.4s, v11.4s
+ eor a10, a10, w8
+ zip2 v27.4s, v10.4s, v11.4s
+ eor a11, a11, w9
+ ldp w6, w7, [x2, #-16]
+ zip1 v28.4s, v12.4s, v13.4s
+ ldp w8, w9, [x2, #-8]
+ eor a12, a12, w6
+ zip2 v29.4s, v12.4s, v13.4s
+ eor a13, a13, w7
+ zip1 v30.4s, v14.4s, v15.4s
+ eor a14, a14, w8
+ zip2 v31.4s, v14.4s, v15.4s
+ eor a15, a15, w9
+
+ add x3, x2, x4
+ sub x3, x3, #128 // start of last block
+
+ subs x5, x4, #128
+ csel x2, x2, x3, ge
+
+ // interleave 64-bit words in state n, n+2
+ zip1 v0.2d, v16.2d, v18.2d
+ zip2 v4.2d, v16.2d, v18.2d
+ stp a0, a1, [x1], #64
+ zip1 v8.2d, v17.2d, v19.2d
+ zip2 v12.2d, v17.2d, v19.2d
+ stp a2, a3, [x1, #-56]
+
+ subs x6, x4, #192
+ ld1 {v16.16b-v19.16b}, [x2], #64
+ csel x2, x2, x3, ge
+
+ zip1 v1.2d, v20.2d, v22.2d
+ zip2 v5.2d, v20.2d, v22.2d
+ stp a4, a5, [x1, #-48]
+ zip1 v9.2d, v21.2d, v23.2d
+ zip2 v13.2d, v21.2d, v23.2d
+ stp a6, a7, [x1, #-40]
+
+ subs x7, x4, #256
+ ld1 {v20.16b-v23.16b}, [x2], #64
+ csel x2, x2, x3, ge
+
+ zip1 v2.2d, v24.2d, v26.2d
+ zip2 v6.2d, v24.2d, v26.2d
+ stp a8, a9, [x1, #-32]
+ zip1 v10.2d, v25.2d, v27.2d
+ zip2 v14.2d, v25.2d, v27.2d
+ stp a10, a11, [x1, #-24]
+
+ subs x8, x4, #320
+ ld1 {v24.16b-v27.16b}, [x2], #64
+ csel x2, x2, x3, ge
+
+ zip1 v3.2d, v28.2d, v30.2d
+ zip2 v7.2d, v28.2d, v30.2d
+ stp a12, a13, [x1, #-16]
+ zip1 v11.2d, v29.2d, v31.2d
+ zip2 v15.2d, v29.2d, v31.2d
+ stp a14, a15, [x1, #-8]
+
+ tbnz x5, #63, .Lt128
+ ld1 {v28.16b-v31.16b}, [x2]
+
+ // xor with corresponding input, write to output
+ eor v16.16b, v16.16b, v0.16b
+ eor v17.16b, v17.16b, v1.16b
+ eor v18.16b, v18.16b, v2.16b
+ eor v19.16b, v19.16b, v3.16b
+
+ tbnz x6, #63, .Lt192
+
+ eor v20.16b, v20.16b, v4.16b
+ eor v21.16b, v21.16b, v5.16b
+ eor v22.16b, v22.16b, v6.16b
+ eor v23.16b, v23.16b, v7.16b
+
+ st1 {v16.16b-v19.16b}, [x1], #64
+ tbnz x7, #63, .Lt256
+
+ eor v24.16b, v24.16b, v8.16b
+ eor v25.16b, v25.16b, v9.16b
+ eor v26.16b, v26.16b, v10.16b
+ eor v27.16b, v27.16b, v11.16b
+
+ st1 {v20.16b-v23.16b}, [x1], #64
+ tbnz x8, #63, .Lt320
+
+ eor v28.16b, v28.16b, v12.16b
+ eor v29.16b, v29.16b, v13.16b
+ eor v30.16b, v30.16b, v14.16b
+ eor v31.16b, v31.16b, v15.16b
+
+ st1 {v24.16b-v27.16b}, [x1], #64
+ st1 {v28.16b-v31.16b}, [x1]
+
+.Lout: frame_pop
+ ret
+
+ // fewer than 192 bytes of in/output
+.Lt192: cbz x5, 1f // exactly 128 bytes?
+ ld1 {v28.16b-v31.16b}, [x10]
+ add x5, x5, x1
+ tbl v28.16b, {v4.16b-v7.16b}, v28.16b
+ tbl v29.16b, {v4.16b-v7.16b}, v29.16b
+ tbl v30.16b, {v4.16b-v7.16b}, v30.16b
+ tbl v31.16b, {v4.16b-v7.16b}, v31.16b
+
+0: eor v20.16b, v20.16b, v28.16b
+ eor v21.16b, v21.16b, v29.16b
+ eor v22.16b, v22.16b, v30.16b
+ eor v23.16b, v23.16b, v31.16b
+ st1 {v20.16b-v23.16b}, [x5] // overlapping stores
+1: st1 {v16.16b-v19.16b}, [x1]
+ b .Lout
+
+ // fewer than 128 bytes of in/output
+.Lt128: ld1 {v28.16b-v31.16b}, [x10]
+ add x5, x5, x1
+ sub x1, x1, #64
+ tbl v28.16b, {v0.16b-v3.16b}, v28.16b
+ tbl v29.16b, {v0.16b-v3.16b}, v29.16b
+ tbl v30.16b, {v0.16b-v3.16b}, v30.16b
+ tbl v31.16b, {v0.16b-v3.16b}, v31.16b
+ ld1 {v16.16b-v19.16b}, [x1] // reload first output block
+ b 0b
+
+ // fewer than 256 bytes of in/output
+.Lt256: cbz x6, 2f // exactly 192 bytes?
+ ld1 {v4.16b-v7.16b}, [x10]
+ add x6, x6, x1
+ tbl v0.16b, {v8.16b-v11.16b}, v4.16b
+ tbl v1.16b, {v8.16b-v11.16b}, v5.16b
+ tbl v2.16b, {v8.16b-v11.16b}, v6.16b
+ tbl v3.16b, {v8.16b-v11.16b}, v7.16b
+
+ eor v28.16b, v28.16b, v0.16b
+ eor v29.16b, v29.16b, v1.16b
+ eor v30.16b, v30.16b, v2.16b
+ eor v31.16b, v31.16b, v3.16b
+ st1 {v28.16b-v31.16b}, [x6] // overlapping stores
+2: st1 {v20.16b-v23.16b}, [x1]
+ b .Lout
+
+ // fewer than 320 bytes of in/output
+.Lt320: cbz x7, 3f // exactly 256 bytes?
+ ld1 {v4.16b-v7.16b}, [x10]
+ add x7, x7, x1
+ tbl v0.16b, {v12.16b-v15.16b}, v4.16b
+ tbl v1.16b, {v12.16b-v15.16b}, v5.16b
+ tbl v2.16b, {v12.16b-v15.16b}, v6.16b
+ tbl v3.16b, {v12.16b-v15.16b}, v7.16b
+
+ eor v28.16b, v28.16b, v0.16b
+ eor v29.16b, v29.16b, v1.16b
+ eor v30.16b, v30.16b, v2.16b
+ eor v31.16b, v31.16b, v3.16b
+ st1 {v28.16b-v31.16b}, [x7] // overlapping stores
+3: st1 {v24.16b-v27.16b}, [x1]
+ b .Lout
+SYM_FUNC_END(chacha_4block_xor_neon)
+
+ .section ".rodata", "a", %progbits
+ .align L1_CACHE_SHIFT
+.Lpermute:
+ .set .Li, 0
+ .rept 128
+ .byte (.Li - 64)
+ .set .Li, .Li + 1
+ .endr
+
+CTRINC: .word 1, 2, 3, 4
+ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
diff --git a/lib/crypto/arm64/chacha.h b/lib/crypto/arm64/chacha.h
new file mode 100644
index 000000000000..ca8c6a8b0578
--- /dev/null
+++ b/lib/crypto/arm64/chacha.h
@@ -0,0 +1,96 @@
+/*
+ * ChaCha and HChaCha functions (ARM64 optimized)
+ *
+ * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/internal/simd.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+
+#include <asm/hwcap.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
+ u8 *dst, const u8 *src, int nrounds);
+asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ int nrounds, int bytes);
+asmlinkage void hchacha_block_neon(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
+ int bytes, int nrounds)
+{
+ while (bytes > 0) {
+ int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
+
+ if (l <= CHACHA_BLOCK_SIZE) {
+ u8 buf[CHACHA_BLOCK_SIZE];
+
+ memcpy(buf, src, l);
+ chacha_block_xor_neon(state, buf, buf, nrounds);
+ memcpy(dst, buf, l);
+ state->x[12] += 1;
+ break;
+ }
+ chacha_4block_xor_neon(state, dst, src, nrounds, l);
+ bytes -= l;
+ src += l;
+ dst += l;
+ state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
+ }
+}
+
+static void hchacha_block_arch(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+ if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) {
+ hchacha_block_generic(state, out, nrounds);
+ } else {
+ scoped_ksimd()
+ hchacha_block_neon(state, out, nrounds);
+ }
+}
+
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int bytes, int nrounds)
+{
+ if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
+ !crypto_simd_usable())
+ return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ scoped_ksimd()
+ chacha_doneon(state, dst, src, todo, nrounds);
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
+}
+
+#define chacha_mod_init_arch chacha_mod_init_arch
+static void chacha_mod_init_arch(void)
+{
+ if (cpu_have_named_feature(ASIMD))
+ static_branch_enable(&have_neon);
+}
diff --git a/lib/crypto/arm64/poly1305-armv8.pl b/lib/crypto/arm64/poly1305-armv8.pl
new file mode 100644
index 000000000000..f1930c6b55ce
--- /dev/null
+++ b/lib/crypto/arm64/poly1305-armv8.pl
@@ -0,0 +1,920 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
+# project.
+# ====================================================================
+#
+# This module implements Poly1305 hash for ARMv8.
+#
+# June 2015
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone.
+#
+# IALU/gcc-4.9 NEON
+#
+# Apple A7 1.86/+5% 0.72
+# Cortex-A53 2.69/+58% 1.47
+# Cortex-A57 2.70/+7% 1.14
+# Denver 1.64/+50% 1.18(*)
+# X-Gene 2.13/+68% 2.27
+# Mongoose 1.77/+75% 1.12
+# Kryo 2.70/+55% 1.13
+# ThunderX2 1.17/+95% 1.36
+#
+# (*) the estimate based on resource availability is less than 1.0,
+# i.e. the measured result is worse than expected, presumably because
+# the binary translator is not almighty;
+
+$flavour=shift;
+$output=shift;
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
+my ($mac,$nonce)=($inp,$len);
+
+my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+.extern OPENSSL_armcap_P
+#else
+# define poly1305_init poly1305_block_init
+# define poly1305_blocks poly1305_blocks_arm64
+#endif
+
+.text
+
+// forward "declarations" are required for Apple
+.globl poly1305_blocks
+.globl poly1305_emit
+
+.globl poly1305_init
+.type poly1305_init,%function
+.align 5
+poly1305_init:
+ cmp $inp,xzr
+ stp xzr,xzr,[$ctx] // zero hash value
+ stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
+
+ csel x0,xzr,x0,eq
+ b.eq .Lno_key
+
+#ifndef __KERNEL__
+ adrp x17,OPENSSL_armcap_P
+ ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
+#endif
+
+ ldp $r0,$r1,[$inp] // load key
+ mov $s1,#0xfffffffc0fffffff
+ movk $s1,#0x0fff,lsl#48
+#ifdef __AARCH64EB__
+ rev $r0,$r0 // flip bytes
+ rev $r1,$r1
+#endif
+ and $r0,$r0,$s1 // &=0ffffffc0fffffff
+ and $s1,$s1,#-4
+ and $r1,$r1,$s1 // &=0ffffffc0ffffffc
+ mov w#$s1,#-1
+ stp $r0,$r1,[$ctx,#32] // save key value
+ str w#$s1,[$ctx,#48] // impossible key power value
+
+#ifndef __KERNEL__
+ tst w17,#ARMV7_NEON
+
+ adr $d0,.Lpoly1305_blocks
+ adr $r0,.Lpoly1305_blocks_neon
+ adr $d1,.Lpoly1305_emit
+
+ csel $d0,$d0,$r0,eq
+
+# ifdef __ILP32__
+ stp w#$d0,w#$d1,[$len]
+# else
+ stp $d0,$d1,[$len]
+# endif
+#endif
+ mov x0,#1
+.Lno_key:
+ ret
+.size poly1305_init,.-poly1305_init
+
+.type poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+.Lpoly1305_blocks:
+ ands $len,$len,#-16
+ b.eq .Lno_data
+
+ ldp $h0,$h1,[$ctx] // load hash value
+ ldp $h2,x17,[$ctx,#16] // [along with is_base2_26]
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+#ifdef __AARCH64EB__
+ lsr $d0,$h0,#32
+ mov w#$d1,w#$h0
+ lsr $d2,$h1,#32
+ mov w15,w#$h1
+ lsr x16,$h2,#32
+#else
+ mov w#$d0,w#$h0
+ lsr $d1,$h0,#32
+ mov w#$d2,w#$h1
+ lsr x15,$h1,#32
+ mov w16,w#$h2
+#endif
+
+ add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
+ lsr $d1,$d2,#12
+ adds $d0,$d0,$d2,lsl#52
+ add $d1,$d1,x15,lsl#14
+ adc $d1,$d1,xzr
+ lsr $d2,x16,#24
+ adds $d1,$d1,x16,lsl#40
+ adc $d2,$d2,xzr
+
+ cmp x17,#0 // is_base2_26?
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+ csel $h0,$h0,$d0,eq // choose between radixes
+ csel $h1,$h1,$d1,eq
+ csel $h2,$h2,$d2,eq
+
+.Loop:
+ ldp $t0,$t1,[$inp],#16 // load input
+ sub $len,$len,#16
+#ifdef __AARCH64EB__
+ rev $t0,$t0
+ rev $t1,$t1
+#endif
+ adds $h0,$h0,$t0 // accumulate input
+ adcs $h1,$h1,$t1
+
+ mul $d0,$h0,$r0 // h0*r0
+ adc $h2,$h2,$padbit
+ umulh $d1,$h0,$r0
+
+ mul $t0,$h1,$s1 // h1*5*r1
+ umulh $t1,$h1,$s1
+
+ adds $d0,$d0,$t0
+ mul $t0,$h0,$r1 // h0*r1
+ adc $d1,$d1,$t1
+ umulh $d2,$h0,$r1
+
+ adds $d1,$d1,$t0
+ mul $t0,$h1,$r0 // h1*r0
+ adc $d2,$d2,xzr
+ umulh $t1,$h1,$r0
+
+ adds $d1,$d1,$t0
+ mul $t0,$h2,$s1 // h2*5*r1
+ adc $d2,$d2,$t1
+ mul $t1,$h2,$r0 // h2*r0
+
+ adds $d1,$d1,$t0
+ adc $d2,$d2,$t1
+
+ and $t0,$d2,#-4 // final reduction
+ and $h2,$d2,#3
+ add $t0,$t0,$d2,lsr#2
+ adds $h0,$d0,$t0
+ adcs $h1,$d1,xzr
+ adc $h2,$h2,xzr
+
+ cbnz $len,.Loop
+
+ stp $h0,$h1,[$ctx] // store hash value
+ stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26]
+
+.Lno_data:
+ ret
+.size poly1305_blocks,.-poly1305_blocks
+
+.type poly1305_emit,%function
+.align 5
+poly1305_emit:
+.Lpoly1305_emit:
+ ldp $h0,$h1,[$ctx] // load hash base 2^64
+ ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26]
+ ldp $t0,$t1,[$nonce] // load nonce
+
+#ifdef __AARCH64EB__
+ lsr $d0,$h0,#32
+ mov w#$d1,w#$h0
+ lsr $d2,$h1,#32
+ mov w15,w#$h1
+ lsr x16,$h2,#32
+#else
+ mov w#$d0,w#$h0
+ lsr $d1,$h0,#32
+ mov w#$d2,w#$h1
+ lsr x15,$h1,#32
+ mov w16,w#$h2
+#endif
+
+ add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
+ lsr $d1,$d2,#12
+ adds $d0,$d0,$d2,lsl#52
+ add $d1,$d1,x15,lsl#14
+ adc $d1,$d1,xzr
+ lsr $d2,x16,#24
+ adds $d1,$d1,x16,lsl#40
+ adc $d2,$d2,xzr
+
+ cmp $r0,#0 // is_base2_26?
+ csel $h0,$h0,$d0,eq // choose between radixes
+ csel $h1,$h1,$d1,eq
+ csel $h2,$h2,$d2,eq
+
+ adds $d0,$h0,#5 // compare to modulus
+ adcs $d1,$h1,xzr
+ adc $d2,$h2,xzr
+
+ tst $d2,#-4 // see if it's carried/borrowed
+
+ csel $h0,$h0,$d0,eq
+ csel $h1,$h1,$d1,eq
+
+#ifdef __AARCH64EB__
+ ror $t0,$t0,#32 // flip nonce words
+ ror $t1,$t1,#32
+#endif
+ adds $h0,$h0,$t0 // accumulate nonce
+ adc $h1,$h1,$t1
+#ifdef __AARCH64EB__
+ rev $h0,$h0 // flip output bytes
+ rev $h1,$h1
+#endif
+ stp $h0,$h1,[$mac] // write result
+
+ ret
+.size poly1305_emit,.-poly1305_emit
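The tail of poly1305_emit reduces modulo p = 2^130 - 5 by adding 5 and
testing whether the sum reaches 2^130 (the tst $d2,#-4). A hedged C
equivalent, again with u128 standing in for the carry chain:

	u128 t = (u128)h0 + 5;                  /* h + 5 == h - p + 2^130 */
	uint64_t g0 = (uint64_t)t;
	t = (u128)h1 + (t >> 64);
	uint64_t g1 = (uint64_t)t;
	uint64_t g2 = h2 + (uint64_t)(t >> 64);

	if (g2 & ~3ULL) {                       /* bit 130 set: h >= p */
		h0 = g0;
		h1 = g1;
	}
	/* then a plain 128-bit add of the nonce; h1:h0 is the tag */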
+___
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
+my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
+my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
+my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
+my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
+my ($T0,$T1,$MASK) = map("v$_",(29..31));
+
+my ($in2,$zeros)=("x16","x17");
+my $is_base2_26 = $zeros; # borrow
+
+$code.=<<___;
+.type poly1305_mult,%function
+.align 5
+poly1305_mult:
+ mul $d0,$h0,$r0 // h0*r0
+ umulh $d1,$h0,$r0
+
+ mul $t0,$h1,$s1 // h1*5*r1
+ umulh $t1,$h1,$s1
+
+ adds $d0,$d0,$t0
+ mul $t0,$h0,$r1 // h0*r1
+ adc $d1,$d1,$t1
+ umulh $d2,$h0,$r1
+
+ adds $d1,$d1,$t0
+ mul $t0,$h1,$r0 // h1*r0
+ adc $d2,$d2,xzr
+ umulh $t1,$h1,$r0
+
+ adds $d1,$d1,$t0
+ mul $t0,$h2,$s1 // h2*5*r1
+ adc $d2,$d2,$t1
+ mul $t1,$h2,$r0 // h2*r0
+
+ adds $d1,$d1,$t0
+ adc $d2,$d2,$t1
+
+ and $t0,$d2,#-4 // final reduction
+ and $h2,$d2,#3
+ add $t0,$t0,$d2,lsr#2
+ adds $h0,$d0,$t0
+ adcs $h1,$d1,xzr
+ adc $h2,$h2,xzr
+
+ ret
+.size poly1305_mult,.-poly1305_mult
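poly1305_mult is one h = h * r step modulo 2^130 - 5, exploiting the key
clamping (r1 % 4 == 0) so that 5*r1/4 fits a single register. A rough C
rendering, with u128 = unsigned __int128 replacing the mul/umulh pairs:

	uint64_t s1 = r1 + (r1 >> 2);           /* == 5*(r1/4) */
	u128 d0 = (u128)h0 * r0 + (u128)h1 * s1;
	u128 d1 = (u128)h0 * r1 + (u128)h1 * r0
		  + (u128)h2 * s1 + (d0 >> 64);
	uint64_t d2 = h2 * r0 + (uint64_t)(d1 >> 64);
	/* 2^130 == 5 (mod p): fold 5 * (d2 >> 2) back into the low limbs */
	uint64_t c = (d2 & ~3ULL) + (d2 >> 2);
	u128 t = (u128)(uint64_t)d0 + c;
	h0 = (uint64_t)t;
	t = (u128)(uint64_t)d1 + (t >> 64);
	h1 = (uint64_t)t;
	h2 = (d2 & 3) + (uint64_t)(t >> 64);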
+
+.type poly1305_splat,%function
+.align 4
+poly1305_splat:
+ and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x13,$h0,#26,#26
+ extr x14,$h1,$h0,#52
+ and x14,x14,#0x03ffffff
+ ubfx x15,$h1,#14,#26
+ extr x16,$h2,$h1,#40
+
+ str w12,[$ctx,#16*0] // r0
+ add w12,w13,w13,lsl#2 // r1*5
+ str w13,[$ctx,#16*1] // r1
+ add w13,w14,w14,lsl#2 // r2*5
+ str w12,[$ctx,#16*2] // s1
+ str w14,[$ctx,#16*3] // r2
+ add w14,w15,w15,lsl#2 // r3*5
+ str w13,[$ctx,#16*4] // s2
+ str w15,[$ctx,#16*5] // r3
+ add w15,w16,w16,lsl#2 // r4*5
+ str w14,[$ctx,#16*6] // s3
+ str w16,[$ctx,#16*7] // r4
+ str w15,[$ctx,#16*8] // s4
+
+ ret
+.size poly1305_splat,.-poly1305_splat
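poly1305_splat is the inverse radix split for the NEON code: the three
64-bit limbs are cut into five 26-bit pieces, and 5*r_i is stored next
to each piece so the vector loop can fold the modulus in cheaply. The
split itself, as a C sketch:

	uint32_t l0 = h0 & 0x03ffffff;
	uint32_t l1 = (h0 >> 26) & 0x03ffffff;
	uint32_t l2 = (uint32_t)((h0 >> 52) | (h1 << 12)) & 0x03ffffff;
	uint32_t l3 = (h1 >> 14) & 0x03ffffff;
	uint32_t l4 = (uint32_t)((h1 >> 40) | (h2 << 24));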
+
+#ifdef __KERNEL__
+.globl poly1305_blocks_neon
+#endif
+.type poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
+ ldr $is_base2_26,[$ctx,#24]
+ cmp $len,#128
+ b.lo .Lpoly1305_blocks
+
+ .inst 0xd503233f // paciasp
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+
+ stp d8,d9,[sp,#16] // meet ABI requirements
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+
+ cbz $is_base2_26,.Lbase2_64_neon
+
+ ldp w10,w11,[$ctx] // load hash value base 2^26
+ ldp w12,w13,[$ctx,#8]
+ ldr w14,[$ctx,#16]
+
+ tst $len,#31
+ b.eq .Leven_neon
+
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+ add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
+ lsr $h1,x12,#12
+ adds $h0,$h0,x12,lsl#52
+ add $h1,$h1,x13,lsl#14
+ adc $h1,$h1,xzr
+ lsr $h2,x14,#24
+ adds $h1,$h1,x14,lsl#40
+ adc $d2,$h2,xzr // can be partially reduced...
+
+ ldp $d0,$d1,[$inp],#16 // load input
+ sub $len,$len,#16
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+
+ and $t0,$d2,#-4 // ... so reduce
+ and $h2,$d2,#3
+ add $t0,$t0,$d2,lsr#2
+ adds $h0,$h0,$t0
+ adcs $h1,$h1,xzr
+ adc $h2,$h2,xzr
+
+#ifdef __AARCH64EB__
+ rev $d0,$d0
+ rev $d1,$d1
+#endif
+ adds $h0,$h0,$d0 // accumulate input
+ adcs $h1,$h1,$d1
+ adc $h2,$h2,$padbit
+
+ bl poly1305_mult
+
+ and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,$h0,#26,#26
+ extr x12,$h1,$h0,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,$h1,#14,#26
+ extr x14,$h2,$h1,#40
+
+ b .Leven_neon
+
+.align 4
+.Lbase2_64_neon:
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+ ldp $h0,$h1,[$ctx] // load hash value base 2^64
+ ldr $h2,[$ctx,#16]
+
+ tst $len,#31
+ b.eq .Linit_neon
+
+ ldp $d0,$d1,[$inp],#16 // load input
+ sub $len,$len,#16
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+#ifdef __AARCH64EB__
+ rev $d0,$d0
+ rev $d1,$d1
+#endif
+ adds $h0,$h0,$d0 // accumulate input
+ adcs $h1,$h1,$d1
+ adc $h2,$h2,$padbit
+
+ bl poly1305_mult
+
+.Linit_neon:
+ ldr w17,[$ctx,#48] // first table element
+ and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,$h0,#26,#26
+ extr x12,$h1,$h0,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,$h1,#14,#26
+ extr x14,$h2,$h1,#40
+
+ cmp w17,#-1 // is value impossible?
+ b.ne .Leven_neon
+
+ fmov ${H0},x10
+ fmov ${H1},x11
+ fmov ${H2},x12
+ fmov ${H3},x13
+ fmov ${H4},x14
+
+ ////////////////////////////////// initialize r^n table
+ mov $h0,$r0 // r^1
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+ mov $h1,$r1
+ mov $h2,xzr
+ add $ctx,$ctx,#48+12
+ bl poly1305_splat
+
+ bl poly1305_mult // r^2
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+
+ bl poly1305_mult // r^3
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+
+ bl poly1305_mult // r^4
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+ sub $ctx,$ctx,#48 // restore original $ctx
+ b .Ldo_neon
+
+.align 4
+.Leven_neon:
+ fmov ${H0},x10
+ fmov ${H1},x11
+ fmov ${H2},x12
+ fmov ${H3},x13
+ fmov ${H4},x14
+
+.Ldo_neon:
+ ldp x8,x12,[$inp,#32] // inp[2:3]
+ subs $len,$len,#64
+ ldp x9,x13,[$inp,#48]
+ add $in2,$inp,#96
+ adrp $zeros,.Lzeros
+ add $zeros,$zeros,#:lo12:.Lzeros
+
+ lsl $padbit,$padbit,#24
+ add x15,$ctx,#48
+
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov $IN23_0,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,$padbit,x12,lsr#40
+ add x13,$padbit,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov $IN23_1,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ fmov $IN23_2,x8
+ fmov $IN23_3,x10
+ fmov $IN23_4,x12
+
+ ldp x8,x12,[$inp],#16 // inp[0:1]
+ ldp x9,x13,[$inp],#48
+
+ ld1 {$R0,$R1,$S1,$R2},[x15],#64
+ ld1 {$S2,$R3,$S3,$R4},[x15],#64
+ ld1 {$S4},[x15]
+
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov $IN01_0,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,$padbit,x12,lsr#40
+ add x13,$padbit,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov $IN01_1,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ movi $MASK.2d,#-1
+ fmov $IN01_2,x8
+ fmov $IN01_3,x10
+ fmov $IN01_4,x12
+ ushr $MASK.2d,$MASK.2d,#38
+
+ b.ls .Lskip_loop
+
+.align 4
+.Loop_neon:
+ ////////////////////////////////////////////////////////////////
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ // \___________________/
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ // \___________________/ \____________________/
+ //
+ // Note that we start with inp[2:3]*r^2. This is because it
+ // doesn't depend on the reduction in the previous iteration.
+ ////////////////////////////////////////////////////////////////
+ // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
+ // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
+ // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
+ // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
+ // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
+
+ subs $len,$len,#64
+ umull $ACC4,$IN23_0,${R4}[2]
+ csel $in2,$zeros,$in2,lo
+ umull $ACC3,$IN23_0,${R3}[2]
+ umull $ACC2,$IN23_0,${R2}[2]
+ ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
+ umull $ACC1,$IN23_0,${R1}[2]
+ ldp x9,x13,[$in2],#48
+ umull $ACC0,$IN23_0,${R0}[2]
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ umlal $ACC4,$IN23_1,${R3}[2]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal $ACC3,$IN23_1,${R2}[2]
+ and x5,x9,#0x03ffffff
+ umlal $ACC2,$IN23_1,${R1}[2]
+ ubfx x6,x8,#26,#26
+ umlal $ACC1,$IN23_1,${R0}[2]
+ ubfx x7,x9,#26,#26
+ umlal $ACC0,$IN23_1,${S4}[2]
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+
+ umlal $ACC4,$IN23_2,${R2}[2]
+ extr x8,x12,x8,#52
+ umlal $ACC3,$IN23_2,${R1}[2]
+ extr x9,x13,x9,#52
+ umlal $ACC2,$IN23_2,${R0}[2]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal $ACC1,$IN23_2,${S4}[2]
+ fmov $IN23_0,x4
+ umlal $ACC0,$IN23_2,${S3}[2]
+ and x8,x8,#0x03ffffff
+
+ umlal $ACC4,$IN23_3,${R1}[2]
+ and x9,x9,#0x03ffffff
+ umlal $ACC3,$IN23_3,${R0}[2]
+ ubfx x10,x12,#14,#26
+ umlal $ACC2,$IN23_3,${S4}[2]
+ ubfx x11,x13,#14,#26
+ umlal $ACC1,$IN23_3,${S3}[2]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal $ACC0,$IN23_3,${S2}[2]
+ fmov $IN23_1,x6
+
+ add $IN01_2,$IN01_2,$H2
+ add x12,$padbit,x12,lsr#40
+ umlal $ACC4,$IN23_4,${R0}[2]
+ add x13,$padbit,x13,lsr#40
+ umlal $ACC3,$IN23_4,${S4}[2]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal $ACC2,$IN23_4,${S3}[2]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal $ACC1,$IN23_4,${S2}[2]
+ fmov $IN23_2,x8
+ umlal $ACC0,$IN23_4,${S1}[2]
+ fmov $IN23_3,x10
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4 and accumulate
+
+ add $IN01_0,$IN01_0,$H0
+ fmov $IN23_4,x12
+ umlal $ACC3,$IN01_2,${R1}[0]
+ ldp x8,x12,[$inp],#16 // inp[0:1]
+ umlal $ACC0,$IN01_2,${S3}[0]
+ ldp x9,x13,[$inp],#48
+ umlal $ACC4,$IN01_2,${R2}[0]
+ umlal $ACC1,$IN01_2,${S4}[0]
+ umlal $ACC2,$IN01_2,${R0}[0]
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ add $IN01_1,$IN01_1,$H1
+ umlal $ACC3,$IN01_0,${R3}[0]
+ umlal $ACC4,$IN01_0,${R4}[0]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal $ACC2,$IN01_0,${R2}[0]
+ and x5,x9,#0x03ffffff
+ umlal $ACC0,$IN01_0,${R0}[0]
+ ubfx x6,x8,#26,#26
+ umlal $ACC1,$IN01_0,${R1}[0]
+ ubfx x7,x9,#26,#26
+
+ add $IN01_3,$IN01_3,$H3
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ umlal $ACC3,$IN01_1,${R2}[0]
+ extr x8,x12,x8,#52
+ umlal $ACC4,$IN01_1,${R3}[0]
+ extr x9,x13,x9,#52
+ umlal $ACC0,$IN01_1,${S4}[0]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal $ACC2,$IN01_1,${R1}[0]
+ fmov $IN01_0,x4
+ umlal $ACC1,$IN01_1,${R0}[0]
+ and x8,x8,#0x03ffffff
+
+ add $IN01_4,$IN01_4,$H4
+ and x9,x9,#0x03ffffff
+ umlal $ACC3,$IN01_3,${R0}[0]
+ ubfx x10,x12,#14,#26
+ umlal $ACC0,$IN01_3,${S2}[0]
+ ubfx x11,x13,#14,#26
+ umlal $ACC4,$IN01_3,${R1}[0]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal $ACC1,$IN01_3,${S3}[0]
+ fmov $IN01_1,x6
+ umlal $ACC2,$IN01_3,${S4}[0]
+ add x12,$padbit,x12,lsr#40
+
+ umlal $ACC3,$IN01_4,${S4}[0]
+ add x13,$padbit,x13,lsr#40
+ umlal $ACC0,$IN01_4,${S1}[0]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal $ACC4,$IN01_4,${R0}[0]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal $ACC1,$IN01_4,${S2}[0]
+ fmov $IN01_2,x8
+ umlal $ACC2,$IN01_4,${S3}[0]
+ fmov $IN01_3,x10
+ fmov $IN01_4,x12
+
+ /////////////////////////////////////////////////////////////////
+ // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ // and P. Schwabe
+ //
+ // [see discussion in poly1305-armv4 module]
+
+ ushr $T0.2d,$ACC3,#26
+ xtn $H3,$ACC3
+ ushr $T1.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+ add $ACC4,$ACC4,$T0.2d // h3 -> h4
+ bic $H3,#0xfc,lsl#24 // &=0x03ffffff
+ add $ACC1,$ACC1,$T1.2d // h0 -> h1
+
+ ushr $T0.2d,$ACC4,#26
+ xtn $H4,$ACC4
+ ushr $T1.2d,$ACC1,#26
+ xtn $H1,$ACC1
+ bic $H4,#0xfc,lsl#24
+ add $ACC2,$ACC2,$T1.2d // h1 -> h2
+
+ add $ACC0,$ACC0,$T0.2d
+ shl $T0.2d,$T0.2d,#2
+ shrn $T1.2s,$ACC2,#26
+ xtn $H2,$ACC2
+ add $ACC0,$ACC0,$T0.2d // h4 -> h0
+ bic $H1,#0xfc,lsl#24
+ add $H3,$H3,$T1.2s // h2 -> h3
+ bic $H2,#0xfc,lsl#24
+
+ shrn $T0.2s,$ACC0,#26
+ xtn $H0,$ACC0
+ ushr $T1.2s,$H3,#26
+ bic $H3,#0xfc,lsl#24
+ bic $H0,#0xfc,lsl#24
+ add $H1,$H1,$T0.2s // h0 -> h1
+ add $H4,$H4,$T1.2s // h3 -> h4
+
+ b.hi .Loop_neon
+
+.Lskip_loop:
+ dup $IN23_2,${IN23_2}[0]
+ add $IN01_2,$IN01_2,$H2
+
+ ////////////////////////////////////////////////////////////////
+ // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ adds $len,$len,#32
+ b.ne .Long_tail
+
+ dup $IN23_2,${IN01_2}[0]
+ add $IN23_0,$IN01_0,$H0
+ add $IN23_3,$IN01_3,$H3
+ add $IN23_1,$IN01_1,$H1
+ add $IN23_4,$IN01_4,$H4
+
+.Long_tail:
+ dup $IN23_0,${IN23_0}[0]
+ umull2 $ACC0,$IN23_2,${S3}
+ umull2 $ACC3,$IN23_2,${R1}
+ umull2 $ACC4,$IN23_2,${R2}
+ umull2 $ACC2,$IN23_2,${R0}
+ umull2 $ACC1,$IN23_2,${S4}
+
+ dup $IN23_1,${IN23_1}[0]
+ umlal2 $ACC0,$IN23_0,${R0}
+ umlal2 $ACC2,$IN23_0,${R2}
+ umlal2 $ACC3,$IN23_0,${R3}
+ umlal2 $ACC4,$IN23_0,${R4}
+ umlal2 $ACC1,$IN23_0,${R1}
+
+ dup $IN23_3,${IN23_3}[0]
+ umlal2 $ACC0,$IN23_1,${S4}
+ umlal2 $ACC3,$IN23_1,${R2}
+ umlal2 $ACC2,$IN23_1,${R1}
+ umlal2 $ACC4,$IN23_1,${R3}
+ umlal2 $ACC1,$IN23_1,${R0}
+
+ dup $IN23_4,${IN23_4}[0]
+ umlal2 $ACC3,$IN23_3,${R0}
+ umlal2 $ACC4,$IN23_3,${R1}
+ umlal2 $ACC0,$IN23_3,${S2}
+ umlal2 $ACC1,$IN23_3,${S3}
+ umlal2 $ACC2,$IN23_3,${S4}
+
+ umlal2 $ACC3,$IN23_4,${S4}
+ umlal2 $ACC0,$IN23_4,${S1}
+ umlal2 $ACC4,$IN23_4,${R0}
+ umlal2 $ACC1,$IN23_4,${S2}
+ umlal2 $ACC2,$IN23_4,${S3}
+
+ b.eq .Lshort_tail
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ add $IN01_0,$IN01_0,$H0
+ umlal $ACC3,$IN01_2,${R1}
+ umlal $ACC0,$IN01_2,${S3}
+ umlal $ACC4,$IN01_2,${R2}
+ umlal $ACC1,$IN01_2,${S4}
+ umlal $ACC2,$IN01_2,${R0}
+
+ add $IN01_1,$IN01_1,$H1
+ umlal $ACC3,$IN01_0,${R3}
+ umlal $ACC0,$IN01_0,${R0}
+ umlal $ACC4,$IN01_0,${R4}
+ umlal $ACC1,$IN01_0,${R1}
+ umlal $ACC2,$IN01_0,${R2}
+
+ add $IN01_3,$IN01_3,$H3
+ umlal $ACC3,$IN01_1,${R2}
+ umlal $ACC0,$IN01_1,${S4}
+ umlal $ACC4,$IN01_1,${R3}
+ umlal $ACC1,$IN01_1,${R0}
+ umlal $ACC2,$IN01_1,${R1}
+
+ add $IN01_4,$IN01_4,$H4
+ umlal $ACC3,$IN01_3,${R0}
+ umlal $ACC0,$IN01_3,${S2}
+ umlal $ACC4,$IN01_3,${R1}
+ umlal $ACC1,$IN01_3,${S3}
+ umlal $ACC2,$IN01_3,${S4}
+
+ umlal $ACC3,$IN01_4,${S4}
+ umlal $ACC0,$IN01_4,${S1}
+ umlal $ACC4,$IN01_4,${R0}
+ umlal $ACC1,$IN01_4,${S2}
+ umlal $ACC2,$IN01_4,${S3}
+
+.Lshort_tail:
+ ////////////////////////////////////////////////////////////////
+ // horizontal add
+
+ addp $ACC3,$ACC3,$ACC3
+ ldp d8,d9,[sp,#16] // meet ABI requirements
+ addp $ACC0,$ACC0,$ACC0
+ ldp d10,d11,[sp,#32]
+ addp $ACC4,$ACC4,$ACC4
+ ldp d12,d13,[sp,#48]
+ addp $ACC1,$ACC1,$ACC1
+ ldp d14,d15,[sp,#64]
+ addp $ACC2,$ACC2,$ACC2
+ ldr x30,[sp,#8]
+
+ ////////////////////////////////////////////////////////////////
+ // lazy reduction, but without narrowing
+
+ ushr $T0.2d,$ACC3,#26
+ and $ACC3,$ACC3,$MASK.2d
+ ushr $T1.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+
+ add $ACC4,$ACC4,$T0.2d // h3 -> h4
+ add $ACC1,$ACC1,$T1.2d // h0 -> h1
+
+ ushr $T0.2d,$ACC4,#26
+ and $ACC4,$ACC4,$MASK.2d
+ ushr $T1.2d,$ACC1,#26
+ and $ACC1,$ACC1,$MASK.2d
+ add $ACC2,$ACC2,$T1.2d // h1 -> h2
+
+ add $ACC0,$ACC0,$T0.2d
+ shl $T0.2d,$T0.2d,#2
+ ushr $T1.2d,$ACC2,#26
+ and $ACC2,$ACC2,$MASK.2d
+ add $ACC0,$ACC0,$T0.2d // h4 -> h0
+ add $ACC3,$ACC3,$T1.2d // h2 -> h3
+
+ ushr $T0.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+ ushr $T1.2d,$ACC3,#26
+ and $ACC3,$ACC3,$MASK.2d
+ add $ACC1,$ACC1,$T0.2d // h0 -> h1
+ add $ACC4,$ACC4,$T1.2d // h3 -> h4
+
+ ////////////////////////////////////////////////////////////////
+ // write the result, can be partially reduced
+
+ st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
+ mov x4,#1
+ st1 {$ACC4}[0],[$ctx]
+ str x4,[$ctx,#8] // set is_base2_26
+
+ ldr x29,[sp],#80
+ .inst 0xd50323bf // autiasp
+ ret
+.size poly1305_blocks_neon,.-poly1305_blocks_neon
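The lazy reduction in the NEON loop never fully normalises the limbs; it
only runs the carry chain far enough to keep every limb within headroom.
In scalar C terms the chain is (sketch, with M26 = 0x03ffffff):

	c = h3 >> 26;  h3 &= M26;  h4 += c;        /* h3 -> h4 */
	c = h0 >> 26;  h0 &= M26;  h1 += c;        /* h0 -> h1 */
	c = h4 >> 26;  h4 &= M26;  h0 += c * 5;    /* h4 -> h0: 2^130 == 5 */
	c = h1 >> 26;  h1 &= M26;  h2 += c;        /* h1 -> h2 */
	c = h2 >> 26;  h2 &= M26;  h3 += c;        /* h2 -> h3 */
	c = h0 >> 26;  h0 &= M26;  h1 += c;        /* h0 -> h1 */
	c = h3 >> 26;  h3 &= M26;  h4 += c;        /* h3 -> h4 */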
+
+.pushsection .rodata
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0
+.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
+.popsection
+
+.align 2
+#if !defined(__KERNEL__) && !defined(_WIN64)
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
+#endif
+___
+
+foreach (split("\n",$code)) {
+ s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
+ s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
+ (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
+ (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
+ (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
+ (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
+ (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
+
+ s/\.[124]([sd])\[/.$1\[/;
+ s/w#x([0-9]+)/w$1/g;
+
+ print $_,"\n";
+}
+close STDOUT;
diff --git a/lib/crypto/arm64/poly1305.h b/lib/crypto/arm64/poly1305.h
new file mode 100644
index 000000000000..b77669767cd6
--- /dev/null
+++ b/lib/crypto/arm64/poly1305.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/hwcap.h>
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+
+asmlinkage void poly1305_block_init(struct poly1305_block_state *state,
+ const u8 raw_key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks_arm64(struct poly1305_block_state *state,
+ const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
+ const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit(const struct poly1305_state *state,
+ u8 digest[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4]);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src,
+ unsigned int len, u32 padbit)
+{
+ if (static_branch_likely(&have_neon) && likely(may_use_simd())) {
+ do {
+ unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+ scoped_ksimd()
+ poly1305_blocks_neon(state, src, todo, padbit);
+
+ len -= todo;
+ src += todo;
+ } while (len);
+ } else
+ poly1305_blocks_arm64(state, src, len, padbit);
+}
+
+#define poly1305_mod_init_arch poly1305_mod_init_arch
+static void poly1305_mod_init_arch(void)
+{
+ if (cpu_have_named_feature(ASIMD))
+ static_branch_enable(&have_neon);
+}
diff --git a/lib/crypto/arm64/polyval-ce-core.S b/lib/crypto/arm64/polyval-ce-core.S
new file mode 100644
index 000000000000..7c731a044d02
--- /dev/null
+++ b/lib/crypto/arm64/polyval-ce-core.S
@@ -0,0 +1,359 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Implementation of POLYVAL using ARMv8 Crypto Extensions.
+ *
+ * Copyright 2021 Google LLC
+ */
+/*
+ * This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions
+ * It works on 8 blocks at a time, by precomputing the first 8 keys powers h^8,
+ * ..., h^1 in the POLYVAL finite field. This precomputation allows us to split
+ * finite field multiplication into two steps.
+ *
+ * In the first step, we consider h^i, m_i as normal polynomials of degree less
+ * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
+ * is simply polynomial multiplication.
+ *
+ * In the second step, we compute the reduction of p(x) modulo the finite field
+ * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * This two-step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
+ * multiplication is finite field multiplication. The advantage is that the
+ * two-step process only requires 1 finite field reduction for every 8
+ * polynomial multiplications. Further parallelism is gained by interleaving the
+ * multiplications and polynomial reductions.
+ */
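A condensed sketch of that two-step scheme, with hypothetical helpers
(xor128, clmul256_acc and montgomery_reduce are placeholders, not
functions defined in this file; key powers are stored as h^8 .. h^1):

	struct u256 p = {0};                     /* unreduced 256-bit sum    */
	m[0] = xor128(m[0], acc);                /* fold in the running hash */
	for (int i = 0; i < 8; i++)              /* p += h^(8-i) * m_i, as   */
		clmul256_acc(&p, h_powers[i], m[i]); /* plain polynomials    */
	acc = montgomery_reduce(p);              /* one reduction per 8 blocks */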
+
+#include <linux/linkage.h>
+#define STRIDE_BLOCKS 8
+
+ACCUMULATOR .req x0
+KEY_POWERS .req x1
+MSG .req x2
+BLOCKS_LEFT .req x3
+KEY_START .req x10
+EXTRA_BYTES .req x11
+TMP .req x13
+
+M0 .req v0
+M1 .req v1
+M2 .req v2
+M3 .req v3
+M4 .req v4
+M5 .req v5
+M6 .req v6
+M7 .req v7
+KEY8 .req v8
+KEY7 .req v9
+KEY6 .req v10
+KEY5 .req v11
+KEY4 .req v12
+KEY3 .req v13
+KEY2 .req v14
+KEY1 .req v15
+PL .req v16
+PH .req v17
+TMP_V .req v18
+LO .req v20
+MI .req v21
+HI .req v22
+SUM .req v23
+GSTAR .req v24
+
+ .text
+
+ .arch armv8-a+crypto
+ .align 4
+
+.Lgstar:
+ .quad 0xc200000000000000, 0xc200000000000000
+
+/*
+ * Computes the product of two 128-bit polynomials in X and Y and XORs the
+ * components of the 256-bit product into LO, MI, HI.
+ *
+ * Given:
+ * X = [X_1 : X_0]
+ * Y = [Y_1 : Y_0]
+ *
+ * We compute:
+ * LO += X_0 * Y_0
+ * MI += (X_0 + X_1) * (Y_0 + Y_1)
+ * HI += X_1 * Y_1
+ *
+ * Later, the 256-bit result can be extracted as:
+ * [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0]
+ * This step is done when computing the polynomial reduction for efficiency
+ * reasons.
+ *
+ * Karatsuba multiplication is used instead of Schoolbook multiplication because
+ * it was found to be slightly faster on ARM64 CPUs.
+ */
+.macro karatsuba1 X Y
+ X .req \X
+ Y .req \Y
+ ext v25.16b, X.16b, X.16b, #8
+ ext v26.16b, Y.16b, Y.16b, #8
+ eor v25.16b, v25.16b, X.16b
+ eor v26.16b, v26.16b, Y.16b
+ pmull2 v28.1q, X.2d, Y.2d
+ pmull v29.1q, X.1d, Y.1d
+ pmull v27.1q, v25.1d, v26.1d
+ eor HI.16b, HI.16b, v28.16b
+ eor LO.16b, LO.16b, v29.16b
+ eor MI.16b, MI.16b, v27.16b
+ .unreq X
+ .unreq Y
+.endm
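Schematically, with a hypothetical 64x64 carryless multiply clmul64()
(addition in GF(2) is XOR):

	LO ^= clmul64(x0, y0);
	MI ^= clmul64(x0 ^ x1, y0 ^ y1);
	HI ^= clmul64(x1, y1);
	/* cross term x0*y1 + x1*y0 == MI ^ HI ^ LO, recombined in karatsuba2 */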
+
+/*
+ * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into
+ * them.
+ */
+.macro karatsuba1_store X Y
+ X .req \X
+ Y .req \Y
+ ext v25.16b, X.16b, X.16b, #8
+ ext v26.16b, Y.16b, Y.16b, #8
+ eor v25.16b, v25.16b, X.16b
+ eor v26.16b, v26.16b, Y.16b
+ pmull2 HI.1q, X.2d, Y.2d
+ pmull LO.1q, X.1d, Y.1d
+ pmull MI.1q, v25.1d, v26.1d
+ .unreq X
+ .unreq Y
+.endm
+
+/*
+ * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
+ * the result in PL, PH.
+ * [PH : PL] =
+ * [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
+ */
+.macro karatsuba2
+ // v4 = [HI_1 + MI_1 : HI_0 + MI_0]
+ eor v4.16b, HI.16b, MI.16b
+ // v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]
+ eor v4.16b, v4.16b, LO.16b
+ // v5 = [HI_0 : LO_1]
+ ext v5.16b, LO.16b, HI.16b, #8
+ // v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]
+ eor v4.16b, v4.16b, v5.16b
+ // HI = [HI_0 : HI_1]
+ ext HI.16b, HI.16b, HI.16b, #8
+ // LO = [LO_0 : LO_1]
+ ext LO.16b, LO.16b, LO.16b, #8
+ // PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]
+ ext PH.16b, v4.16b, HI.16b, #8
+ // PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
+ ext PL.16b, LO.16b, v4.16b, #8
+.endm
+
+/*
+ * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
+ *
+ * This macro computes p(x) mod g(x) where p(x) is in Montgomery form and g(x) =
+ * x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
+ * product of two 128-bit polynomials in Montgomery form. We need to reduce it
+ * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
+ * of x^128, this product has two extra factors of x^128. To get it back into
+ * Montgomery form, we need to remove one of these factors by dividing by x^128.
+ *
+ * To accomplish both of these goals, we add multiples of g(x) that cancel out
+ * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
+ * bits are zero, the polynomial division by x^128 can be done by right
+ * shifting.
+ *
+ * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
+ * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
+ * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
+ * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
+ * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
+ * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
+ *
+ * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
+ * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
+ * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
+ * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
+ * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
+ *
+ * So our final computation is:
+ * T = T_1 : T_0 = g*(x) * P_0
+ * V = V_1 : V_0 = g*(x) * (P_1 + T_0)
+ * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
+ *
+ * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
+ * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
+ * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
+ */
+.macro montgomery_reduction dest
+ DEST .req \dest
+ // TMP_V = T_1 : T_0 = P_0 * g*(x)
+ pmull TMP_V.1q, PL.1d, GSTAR.1d
+ // TMP_V = T_0 : T_1
+ ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
+ // TMP_V = P_1 + T_0 : P_0 + T_1
+ eor TMP_V.16b, PL.16b, TMP_V.16b
+ // PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
+ eor PH.16b, PH.16b, TMP_V.16b
+ // TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)
+ pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d
+ eor DEST.16b, PH.16b, TMP_V.16b
+ .unreq DEST
+.endm
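Following the derivation above, the macro is equivalent to this sketch
(clmul64(), lo64() and hi64() are hypothetical helpers; GSTAR is the
0xc200000000000000 constant loaded from .Lgstar):

	u128 t = clmul64(GSTAR, p0);            /* T1:T0 = g*(x) * P_0         */
	uint64_t u0 = p0 ^ hi64(t);             /* P_0 + T_1                   */
	uint64_t u1 = p1 ^ lo64(t);             /* P_1 + T_0                   */
	u128 v = clmul64(GSTAR, u1);            /* V1:V0 = g*(x) * (P_1 + T_0) */
	dest_lo = p2 ^ u0 ^ lo64(v);            /* P_2 + P_0 + T_1 + V_0       */
	dest_hi = p3 ^ u1 ^ hi64(v);            /* P_3 + P_1 + T_0 + V_1       */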
+
+/*
+ * Compute POLYVAL on 8 blocks.
+ *
+ * If reduce is set, also computes the Montgomery reduction of the
+ * previous full_stride call and XORs it with the first message block:
+ * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
+ * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
+ *
+ * Sets PL, PH.
+ */
+.macro full_stride reduce
+ eor LO.16b, LO.16b, LO.16b
+ eor MI.16b, MI.16b, MI.16b
+ eor HI.16b, HI.16b, HI.16b
+
+ ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
+ ld1 {M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64
+
+ karatsuba1 M7 KEY1
+ .if \reduce
+ pmull TMP_V.1q, PL.1d, GSTAR.1d
+ .endif
+
+ karatsuba1 M6 KEY2
+ .if \reduce
+ ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
+ .endif
+
+ karatsuba1 M5 KEY3
+ .if \reduce
+ eor TMP_V.16b, PL.16b, TMP_V.16b
+ .endif
+
+ karatsuba1 M4 KEY4
+ .if \reduce
+ eor PH.16b, PH.16b, TMP_V.16b
+ .endif
+
+ karatsuba1 M3 KEY5
+ .if \reduce
+ pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d
+ .endif
+
+ karatsuba1 M2 KEY6
+ .if \reduce
+ eor SUM.16b, PH.16b, TMP_V.16b
+ .endif
+
+ karatsuba1 M1 KEY7
+ eor M0.16b, M0.16b, SUM.16b
+
+ karatsuba1 M0 KEY8
+ karatsuba2
+.endm
+
+/*
+ * Handle any extra blocks after the full_stride loop.
+ */
+.macro partial_stride
+ add KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)
+ sub KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4
+ ld1 {KEY1.16b}, [KEY_POWERS], #16
+
+ ld1 {TMP_V.16b}, [MSG], #16
+ eor SUM.16b, SUM.16b, TMP_V.16b
+ karatsuba1_store KEY1 SUM
+ sub BLOCKS_LEFT, BLOCKS_LEFT, #1
+
+ tst BLOCKS_LEFT, #4
+ beq .Lpartial4BlocksDone
+ ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
+ ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
+ karatsuba1 M0 KEY8
+ karatsuba1 M1 KEY7
+ karatsuba1 M2 KEY6
+ karatsuba1 M3 KEY5
+.Lpartial4BlocksDone:
+ tst BLOCKS_LEFT, #2
+ beq .Lpartial2BlocksDone
+ ld1 {M0.16b, M1.16b}, [MSG], #32
+ ld1 {KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
+ karatsuba1 M0 KEY8
+ karatsuba1 M1 KEY7
+.Lpartial2BlocksDone:
+ tst BLOCKS_LEFT, #1
+ beq .LpartialDone
+ ld1 {M0.16b}, [MSG], #16
+ ld1 {KEY8.16b}, [KEY_POWERS], #16
+ karatsuba1 M0 KEY8
+.LpartialDone:
+ karatsuba2
+ montgomery_reduction SUM
+.endm
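In effect the n < 8 leftover blocks are paired with the lowest n key
powers (sketch, reusing the hypothetical helpers from above):

	const struct polyval_elem *kp = &h_powers[8 - n];   /* kp[0] == h^n */
	struct u256 p = {0};
	clmul256_acc(&p, kp[0], xor128(acc, m[0]));   /* (acc + m_0) * h^n */
	for (size_t i = 1; i < n; i++)
		clmul256_acc(&p, kp[i], m[i]);        /* m_i * h^(n-i) */
	acc = montgomery_reduce(p);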
+
+/*
+ * Computes a = a * b * x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * void polyval_mul_pmull(struct polyval_elem *a,
+ * const struct polyval_elem *b);
+ */
+SYM_FUNC_START(polyval_mul_pmull)
+ adr TMP, .Lgstar
+ ld1 {GSTAR.2d}, [TMP]
+ ld1 {v0.16b}, [x0]
+ ld1 {v1.16b}, [x1]
+ karatsuba1_store v0 v1
+ karatsuba2
+ montgomery_reduction SUM
+ st1 {SUM.16b}, [x0]
+ ret
+SYM_FUNC_END(polyval_mul_pmull)
+
+/*
+ * Perform polynomial evaluation as specified by POLYVAL. This computes:
+ * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
+ * where n=nblocks, h is the hash key, and m_i are the message blocks.
+ *
+ * x0 - pointer to accumulator
+ * x1 - pointer to precomputed key powers h^8 ... h^1
+ * x2 - pointer to message blocks
+ * x3 - number of blocks to hash
+ *
+ * void polyval_blocks_pmull(struct polyval_elem *acc,
+ * const struct polyval_key *key,
+ * const u8 *data, size_t nblocks);
+ */
+SYM_FUNC_START(polyval_blocks_pmull)
+ adr TMP, .Lgstar
+ mov KEY_START, KEY_POWERS
+ ld1 {GSTAR.2d}, [TMP]
+ ld1 {SUM.16b}, [ACCUMULATOR]
+ subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+ blt .LstrideLoopExit
+ ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
+ ld1 {KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64
+ full_stride 0
+ subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+ blt .LstrideLoopExitReduce
+.LstrideLoop:
+ full_stride 1
+ subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+ bge .LstrideLoop
+.LstrideLoopExitReduce:
+ montgomery_reduction SUM
+.LstrideLoopExit:
+ adds BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+ beq .LskipPartial
+ partial_stride
+.LskipPartial:
+ st1 {SUM.16b}, [ACCUMULATOR]
+ ret
+SYM_FUNC_END(polyval_blocks_pmull)
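The whole function is equivalent to the obvious one-block Horner
recurrence (reference sketch; polyval_mul here is a hypothetical field
multiply by h):

	for (size_t i = 0; i < nblocks; i++)
		acc = polyval_mul(xor128(acc, m[i]), h);    /* (acc + m_i) * h */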
diff --git a/lib/crypto/arm64/polyval.h b/lib/crypto/arm64/polyval.h
new file mode 100644
index 000000000000..a39763395e9b
--- /dev/null
+++ b/lib/crypto/arm64/polyval.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * POLYVAL library functions, arm64 optimized
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+#define NUM_H_POWERS 8
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
+
+asmlinkage void polyval_mul_pmull(struct polyval_elem *a,
+ const struct polyval_elem *b);
+asmlinkage void polyval_blocks_pmull(struct polyval_elem *acc,
+ const struct polyval_key *key,
+ const u8 *data, size_t nblocks);
+
+static void polyval_preparekey_arch(struct polyval_key *key,
+ const u8 raw_key[POLYVAL_BLOCK_SIZE])
+{
+ static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS);
+ memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE);
+ if (static_branch_likely(&have_pmull) && may_use_simd()) {
+ scoped_ksimd() {
+ for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+ key->h_powers[i] = key->h_powers[i + 1];
+ polyval_mul_pmull(
+ &key->h_powers[i],
+ &key->h_powers[NUM_H_POWERS - 1]);
+ }
+ }
+ } else {
+ for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+ key->h_powers[i] = key->h_powers[i + 1];
+ polyval_mul_generic(&key->h_powers[i],
+ &key->h_powers[NUM_H_POWERS - 1]);
+ }
+ }
+}
+
+static void polyval_mul_arch(struct polyval_elem *acc,
+ const struct polyval_key *key)
+{
+ if (static_branch_likely(&have_pmull) && may_use_simd()) {
+ scoped_ksimd()
+ polyval_mul_pmull(acc, &key->h_powers[NUM_H_POWERS - 1]);
+ } else {
+ polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]);
+ }
+}
+
+static void polyval_blocks_arch(struct polyval_elem *acc,
+ const struct polyval_key *key,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_pmull) && may_use_simd()) {
+ do {
+ /* Allow rescheduling every 4 KiB. */
+ size_t n = min_t(size_t, nblocks,
+ 4096 / POLYVAL_BLOCK_SIZE);
+
+ scoped_ksimd()
+ polyval_blocks_pmull(acc, key, data, n);
+ data += n * POLYVAL_BLOCK_SIZE;
+ nblocks -= n;
+ } while (nblocks);
+ } else {
+ polyval_blocks_generic(acc, &key->h_powers[NUM_H_POWERS - 1],
+ data, nblocks);
+ }
+}
+
+#define polyval_mod_init_arch polyval_mod_init_arch
+static void polyval_mod_init_arch(void)
+{
+ if (cpu_have_named_feature(PMULL))
+ static_branch_enable(&have_pmull);
+}
diff --git a/lib/crypto/arm64/sha1-ce-core.S b/lib/crypto/arm64/sha1-ce-core.S
new file mode 100644
index 000000000000..8fbd4767f0f0
--- /dev/null
+++ b/lib/crypto/arm64/sha1-ce-core.S
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SHA-1 secure hash using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+ .arch armv8-a+crypto
+
+ k0 .req v0
+ k1 .req v1
+ k2 .req v2
+ k3 .req v3
+
+ t0 .req v4
+ t1 .req v5
+
+ dga .req q6
+ dgav .req v6
+ dgb .req s7
+ dgbv .req v7
+
+ dg0q .req q12
+ dg0s .req s12
+ dg0v .req v12
+ dg1s .req s13
+ dg1v .req v13
+ dg2s .req s14
+
+ .macro add_only, op, ev, rc, s0, dg1
+ .ifc \ev, ev
+ add t1.4s, v\s0\().4s, \rc\().4s
+ sha1h dg2s, dg0s
+ .ifnb \dg1
+ sha1\op dg0q, \dg1, t0.4s
+ .else
+ sha1\op dg0q, dg1s, t0.4s
+ .endif
+ .else
+ .ifnb \s0
+ add t0.4s, v\s0\().4s, \rc\().4s
+ .endif
+ sha1h dg1s, dg0s
+ sha1\op dg0q, dg2s, t1.4s
+ .endif
+ .endm
+
+ .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1
+ sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s
+ add_only \op, \ev, \rc, \s1, \dg1
+ sha1su1 v\s0\().4s, v\s3\().4s
+ .endm
+
+ .macro loadrc, k, val, tmp
+ movz \tmp, :abs_g0_nc:\val
+ movk \tmp, :abs_g1:\val
+ dup \k, \tmp
+ .endm
+
+ /*
+ * size_t __sha1_ce_transform(struct sha1_block_state *state,
+ * const u8 *data, size_t nblocks);
+ */
+SYM_FUNC_START(__sha1_ce_transform)
+ /* load round constants */
+ loadrc k0.4s, 0x5a827999, w6
+ loadrc k1.4s, 0x6ed9eba1, w6
+ loadrc k2.4s, 0x8f1bbcdc, w6
+ loadrc k3.4s, 0xca62c1d6, w6
+
+ /* load state */
+ ld1 {dgav.4s}, [x0]
+ ldr dgb, [x0, #16]
+
+ /* load input */
+0: ld1 {v8.4s-v11.4s}, [x1], #64
+ sub x2, x2, #1
+
+CPU_LE( rev32 v8.16b, v8.16b )
+CPU_LE( rev32 v9.16b, v9.16b )
+CPU_LE( rev32 v10.16b, v10.16b )
+CPU_LE( rev32 v11.16b, v11.16b )
+
+ add t0.4s, v8.4s, k0.4s
+ mov dg0v.16b, dgav.16b
+
+ add_update c, ev, k0, 8, 9, 10, 11, dgb
+ add_update c, od, k0, 9, 10, 11, 8
+ add_update c, ev, k0, 10, 11, 8, 9
+ add_update c, od, k0, 11, 8, 9, 10
+ add_update c, ev, k1, 8, 9, 10, 11
+
+ add_update p, od, k1, 9, 10, 11, 8
+ add_update p, ev, k1, 10, 11, 8, 9
+ add_update p, od, k1, 11, 8, 9, 10
+ add_update p, ev, k1, 8, 9, 10, 11
+ add_update p, od, k2, 9, 10, 11, 8
+
+ add_update m, ev, k2, 10, 11, 8, 9
+ add_update m, od, k2, 11, 8, 9, 10
+ add_update m, ev, k2, 8, 9, 10, 11
+ add_update m, od, k2, 9, 10, 11, 8
+ add_update m, ev, k3, 10, 11, 8, 9
+
+ add_update p, od, k3, 11, 8, 9, 10
+ add_only p, ev, k3, 9
+ add_only p, od, k3, 10
+ add_only p, ev, k3, 11
+ add_only p, od
+
+ /* update state */
+ add dgbv.2s, dgbv.2s, dg1v.2s
+ add dgav.4s, dgav.4s, dg0v.4s
+
+ /* return early if voluntary preemption is needed */
+ cond_yield 1f, x5, x6
+
+ /* handled all input blocks? */
+ cbnz x2, 0b
+
+ /* store new state */
+1: st1 {dgav.4s}, [x0]
+ str dgb, [x0, #16]
+ mov x0, x2
+ ret
+SYM_FUNC_END(__sha1_ce_transform)
diff --git a/lib/crypto/arm64/sha1.h b/lib/crypto/arm64/sha1.h
new file mode 100644
index 000000000000..bc7071f1be09
--- /dev/null
+++ b/lib/crypto/arm64/sha1.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-1 optimized for ARM64
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+asmlinkage size_t __sha1_ce_transform(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha1_blocks(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_ce) && likely(may_use_simd())) {
+ do {
+ size_t rem;
+
+ scoped_ksimd()
+ rem = __sha1_ce_transform(state, data, nblocks);
+
+ data += (nblocks - rem) * SHA1_BLOCK_SIZE;
+ nblocks = rem;
+ } while (nblocks);
+ } else {
+ sha1_blocks_generic(state, data, nblocks);
+ }
+}
+
+#define sha1_mod_init_arch sha1_mod_init_arch
+static void sha1_mod_init_arch(void)
+{
+ if (cpu_have_named_feature(SHA1))
+ static_branch_enable(&have_ce);
+}
diff --git a/lib/crypto/arm64/sha2-armv8.pl b/lib/crypto/arm64/sha2-armv8.pl
new file mode 100644
index 000000000000..35ec9ae99fe1
--- /dev/null
+++ b/lib/crypto/arm64/sha2-armv8.pl
@@ -0,0 +1,786 @@
+#! /usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from the OpenSSL project but the author (Andy Polyakov)
+# has relicensed it under the GPLv2. Therefore this program is free software;
+# you can redistribute it and/or modify it under the terms of the GNU General
+# Public License version 2 as published by the Free Software Foundation.
+#
+# The original headers, including the original license headers, are
+# included below for completeness.
+
+# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA256/512 for ARMv8.
+#
+# Performance in cycles per processed byte and improvement coefficient
+# over code generated with "default" compiler:
+#
+# SHA256-hw SHA256(*) SHA512
+# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
+# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
+# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
+# Denver 2.01 10.5 (+26%) 6.70 (+8%)
+# X-Gene 20.0 (+100%) 12.8 (+300%(***))
+# Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
+#
+# (*) Software SHA256 results are of lesser relevance, presented
+# mostly for informational purposes.
+# (**) The result is a trade-off: it's possible to improve it by
+# 10% (or by 1 cycle per round), but at the cost of 20% loss
+# on Cortex-A53 (or by 4 cycles per round).
+# (***) Super-impressive coefficients over gcc-generated code are
+# indication of some compiler "pathology", most notably code
+# generated with -mgeneral-regs-only is significantly faster
+# and the gap is only 40-90%.
+#
+# October 2016.
+#
+# Originally it was reckoned that it made no sense to implement a NEON
+# version of SHA256 for 64-bit processors. This is because the performance
+# improvement on the most widespread Cortex-A5x processors was observed
+# to be marginal: none on Cortex-A53 and ~10% on Cortex-A57. But then it
+# was observed that 32-bit NEON SHA256 performs significantly better than
+# the 64-bit scalar version on *some* of the more recent processors. As a
+# result, a 64-bit NEON version of SHA256 was added to provide the best
+# all-round performance. For example, it executes ~30% faster on X-Gene
+# and Mongoose. [For reference, a NEON version of SHA512 is bound to
+# deliver much less improvement, likely *negative* on Cortex-A5x, which
+# is why NEON support is limited to SHA256.]
+
+$output=pop;
+$flavour=pop;
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open OUT,"| \"$^X\" $xlate $flavour $output";
+ *STDOUT=*OUT;
+} else {
+ open STDOUT,">$output";
+}
+
+if ($output =~ /512/) {
+ $BITS=512;
+ $SZ=8;
+ @Sigma0=(28,34,39);
+ @Sigma1=(14,18,41);
+ @sigma0=(1, 8, 7);
+ @sigma1=(19,61, 6);
+ $rounds=80;
+ $reg_t="x";
+} else {
+ $BITS=256;
+ $SZ=4;
+ @Sigma0=( 2,13,22);
+ @Sigma1=( 6,11,25);
+ @sigma0=( 7,18, 3);
+ @sigma1=(17,19,10);
+ $rounds=64;
+ $reg_t="w";
+}
+
+$func="sha${BITS}_block_data_order";
+
+($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
+
+@X=map("$reg_t$_",(3..15,0..2));
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
+($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
+
+sub BODY_00_xx {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my $j=($i+1)&15;
+my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
+ $T0=@X[$i+3] if ($i<11);
+
+$code.=<<___ if ($i<16);
+#ifndef __AARCH64EB__
+ rev @X[$i],@X[$i] // $i
+#endif
+___
+$code.=<<___ if ($i<13 && ($i&1));
+ ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
+___
+$code.=<<___ if ($i==13);
+ ldp @X[14],@X[15],[$inp]
+___
+$code.=<<___ if ($i>=14);
+ ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
+___
+$code.=<<___ if ($i>0 && $i<16);
+ add $a,$a,$t1 // h+=Sigma0(a)
+___
+$code.=<<___ if ($i>=11);
+ str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
+___
+# While ARMv8 specifies merged rotate-n-logical operations such as
+# 'eor x,y,z,ror#n', they were found to negatively affect performance
+# on Apple A7. The reason seems to be that they require even 'y' to
+# be available earlier. This means that such a merged instruction is
+# not necessarily the best choice on the critical path... On the other
+# hand, Cortex-A5x handles merged instructions much better than
+# disjoint rotate and logical... See (**) footnote above.
+$code.=<<___ if ($i<15);
+ ror $t0,$e,#$Sigma1[0]
+ add $h,$h,$t2 // h+=K[i]
+ eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
+ and $t1,$f,$e
+ bic $t2,$g,$e
+ add $h,$h,@X[$i&15] // h+=X[i]
+ orr $t1,$t1,$t2 // Ch(e,f,g)
+ eor $t2,$a,$b // a^b, b^c in next round
+ eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e)
+ ror $T0,$a,#$Sigma0[0]
+ add $h,$h,$t1 // h+=Ch(e,f,g)
+ eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
+ add $h,$h,$t0 // h+=Sigma1(e)
+ and $t3,$t3,$t2 // (b^c)&=(a^b)
+ add $d,$d,$h // d+=h
+ eor $t3,$t3,$b // Maj(a,b,c)
+ eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a)
+ add $h,$h,$t3 // h+=Maj(a,b,c)
+ ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
+ //add $h,$h,$t1 // h+=Sigma0(a)
+___
+$code.=<<___ if ($i>=15);
+ ror $t0,$e,#$Sigma1[0]
+ add $h,$h,$t2 // h+=K[i]
+ ror $T1,@X[($j+1)&15],#$sigma0[0]
+ and $t1,$f,$e
+ ror $T2,@X[($j+14)&15],#$sigma1[0]
+ bic $t2,$g,$e
+ ror $T0,$a,#$Sigma0[0]
+ add $h,$h,@X[$i&15] // h+=X[i]
+ eor $t0,$t0,$e,ror#$Sigma1[1]
+ eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
+ orr $t1,$t1,$t2 // Ch(e,f,g)
+ eor $t2,$a,$b // a^b, b^c in next round
+ eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e)
+ eor $T0,$T0,$a,ror#$Sigma0[1]
+ add $h,$h,$t1 // h+=Ch(e,f,g)
+ and $t3,$t3,$t2 // (b^c)&=(a^b)
+ eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
+ eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1])
+ add $h,$h,$t0 // h+=Sigma1(e)
+ eor $t3,$t3,$b // Maj(a,b,c)
+ eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a)
+ eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14])
+ add @X[$j],@X[$j],@X[($j+9)&15]
+ add $d,$d,$h // d+=h
+ add $h,$h,$t3 // h+=Maj(a,b,c)
+ ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
+ add @X[$j],@X[$j],$T1
+ add $h,$h,$t1 // h+=Sigma0(a)
+ add @X[$j],@X[$j],$T2
+___
+ ($t2,$t3)=($t3,$t2);
+}
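For reference, one round of the compression function that BODY_00_xx
interleaves, written sequentially in C (sketch; ror() is rotate-right,
and the perlasm tracks Maj via the equivalent ((b^c)&(a^b))^b form):

	t1 = h + Sigma1(e) + Ch(e, f, g) + K[i] + W[i];
	t2 = Sigma0(a) + Maj(a, b, c);
	d += t1;
	h  = t1 + t2;
	/* with, for SHA-256:
	 *   Ch(e,f,g)  = (e & f) ^ (~e & g)
	 *   Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)
	 *   Sigma1(e)  = ror(e,6) ^ ror(e,11) ^ ror(e,25)
	 *   Sigma0(a)  = ror(a,2) ^ ror(a,13) ^ ror(a,22)
	 */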
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#endif
+
+.text
+
+.extern OPENSSL_armcap_P
+.globl $func
+.type $func,%function
+.align 6
+$func:
+___
+$code.=<<___ if ($SZ==4);
+#ifndef __KERNEL__
+# ifdef __ILP32__
+ ldrsw x16,.LOPENSSL_armcap_P
+# else
+ ldr x16,.LOPENSSL_armcap_P
+# endif
+ adr x17,.LOPENSSL_armcap_P
+ add x16,x16,x17
+ ldr w16,[x16]
+ tst w16,#ARMV8_SHA256
+ b.ne .Lv8_entry
+ tst w16,#ARMV7_NEON
+ b.ne .Lneon_entry
+#endif
+___
+$code.=<<___;
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#4*$SZ
+
+ ldp $A,$B,[$ctx] // load context
+ ldp $C,$D,[$ctx,#2*$SZ]
+ ldp $E,$F,[$ctx,#4*$SZ]
+ add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
+ ldp $G,$H,[$ctx,#6*$SZ]
+ adr $Ktbl,.LK$BITS
+ stp $ctx,$num,[x29,#96]
+
+.Loop:
+ ldp @X[0],@X[1],[$inp],#2*$SZ
+ ldr $t2,[$Ktbl],#$SZ // *K++
+ eor $t3,$B,$C // magic seed
+ str $inp,[x29,#112]
+___
+for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=".Loop_16_xx:\n";
+for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ cbnz $t2,.Loop_16_xx
+
+ ldp $ctx,$num,[x29,#96]
+ ldr $inp,[x29,#112]
+ sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
+
+ ldp @X[0],@X[1],[$ctx]
+ ldp @X[2],@X[3],[$ctx,#2*$SZ]
+ add $inp,$inp,#14*$SZ // advance input pointer
+ ldp @X[4],@X[5],[$ctx,#4*$SZ]
+ add $A,$A,@X[0]
+ ldp @X[6],@X[7],[$ctx,#6*$SZ]
+ add $B,$B,@X[1]
+ add $C,$C,@X[2]
+ add $D,$D,@X[3]
+ stp $A,$B,[$ctx]
+ add $E,$E,@X[4]
+ add $F,$F,@X[5]
+ stp $C,$D,[$ctx,#2*$SZ]
+ add $G,$G,@X[6]
+ add $H,$H,@X[7]
+ cmp $inp,$num
+ stp $E,$F,[$ctx,#4*$SZ]
+ stp $G,$H,[$ctx,#6*$SZ]
+ b.ne .Loop
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#4*$SZ
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+ ret
+.size $func,.-$func
+
+.align 6
+.type .LK$BITS,%object
+.LK$BITS:
+___
+$code.=<<___ if ($SZ==8);
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+ .quad 0 // terminator
+___
+$code.=<<___ if ($SZ==4);
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+ .long 0 //terminator
+___
+$code.=<<___;
+.size .LK$BITS,.-.LK$BITS
+#ifndef __KERNEL__
+.align 3
+.LOPENSSL_armcap_P:
+# ifdef __ILP32__
+ .long OPENSSL_armcap_P-.
+# else
+ .quad OPENSSL_armcap_P-.
+# endif
+#endif
+.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+if ($SZ==4) {
+my $Ktbl="x3";
+
+my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
+my @MSG=map("v$_.16b",(4..7));
+my ($W0,$W1)=("v16.4s","v17.4s");
+my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
+
+$code.=<<___;
+#ifndef __KERNEL__
+.type sha256_block_armv8,%function
+.align 6
+sha256_block_armv8:
+.Lv8_entry:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1.32 {$ABCD,$EFGH},[$ctx]
+ adr $Ktbl,.LK256
+
+.Loop_hw:
+ ld1 {@MSG[0]-@MSG[3]},[$inp],#64
+ sub $num,$num,#1
+ ld1.32 {$W0},[$Ktbl],#16
+ rev32 @MSG[0],@MSG[0]
+ rev32 @MSG[1],@MSG[1]
+ rev32 @MSG[2],@MSG[2]
+ rev32 @MSG[3],@MSG[3]
+ orr $ABCD_SAVE,$ABCD,$ABCD // offload
+ orr $EFGH_SAVE,$EFGH,$EFGH
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+ ld1.32 {$W1},[$Ktbl],#16
+ add.i32 $W0,$W0,@MSG[0]
+ sha256su0 @MSG[0],@MSG[1]
+ orr $abcd,$ABCD,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+ sha256su1 @MSG[0],@MSG[2],@MSG[3]
+___
+ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+ ld1.32 {$W1},[$Ktbl],#16
+ add.i32 $W0,$W0,@MSG[0]
+ orr $abcd,$ABCD,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ ld1.32 {$W0},[$Ktbl],#16
+ add.i32 $W1,$W1,@MSG[1]
+ orr $abcd,$ABCD,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ ld1.32 {$W1},[$Ktbl]
+ add.i32 $W0,$W0,@MSG[2]
+ sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
+ orr $abcd,$ABCD,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ add.i32 $W1,$W1,@MSG[3]
+ orr $abcd,$ABCD,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ add.i32 $ABCD,$ABCD,$ABCD_SAVE
+ add.i32 $EFGH,$EFGH,$EFGH_SAVE
+
+ cbnz $num,.Loop_hw
+
+ st1.32 {$ABCD,$EFGH},[$ctx]
+
+ ldr x29,[sp],#16
+ ret
+.size sha256_block_armv8,.-sha256_block_armv8
+#endif
+___
+}
+
+if ($SZ==4) { ######################################### NEON stuff #
+# You'll surely note a lot of similarities with the sha256-armv4 module,
+# and of course that's no coincidence. sha256-armv4 was used as the
+# initial template, but it was adapted for the ARMv8 instruction set and
+# extensively re-tuned for all-round performance.
+
+my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
+my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15));
+my $Ktbl="x16";
+my $Xfer="x17";
+my @X = map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
+my $j=0;
+
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+ my $arg = pop;
+ $arg = "#$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
+sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
+sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
+
+sub Xupdate()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ &ext_8 ($T0,@X[0],@X[1],4); # X[1..4]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ext_8 ($T3,@X[2],@X[3],4); # X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ushr_32 ($T2,$T0,$sigma0[0]);
+ eval(shift(@insns));
+ &ushr_32 ($T1,$T0,$sigma0[2]);
+ eval(shift(@insns));
+ &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12]
+ eval(shift(@insns));
+ &sli_32 ($T2,$T0,32-$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ushr_32 ($T3,$T0,$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &eor_8 ($T1,$T1,$T2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &sli_32 ($T3,$T0,32-$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ushr_32 ($T4,$T7,$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &sli_32 ($T4,$T7,32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ushr_32 ($T5,$T7,$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ushr_32 ($T3,$T7,$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &sli_u32 ($T3,$T7,32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &eor_8 ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ushr_32 ($T6,@X[0],$sigma1[0]);
+ eval(shift(@insns));
+ &ushr_32 ($T7,@X[0],$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &sli_32 ($T6,@X[0],32-$sigma1[0]);
+ eval(shift(@insns));
+ &ushr_32 ($T5,@X[0],$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &eor_8 ($T7,$T7,$T6);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &sli_32 ($T5,@X[0],32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ld1_32 ("{$T0}","[$Ktbl], #16");
+ eval(shift(@insns));
+ &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &eor_8 ($T5,$T5,$T5);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &mov (&Dhi($T5), &Dlo($T7));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &add_32 ($T0,$T0,@X[0]);
+ while($#insns>=1) { eval(shift(@insns)); }
+ &st1_32 ("{$T0}","[$Xfer], #16");
+ eval(shift(@insns));
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ld1_8 ("{@X[0]}","[$inp],#16");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ld1_32 ("{$T0}","[$Ktbl],#16");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &rev32 (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &add_32 ($T0,$T0,@X[0]);
+ foreach (@insns) { eval; } # remaining instructions
+ &st1_32 ("{$T0}","[$Xfer], #16");
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub body_00_15 () {
+ (
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+ '&add ($h,$h,$t1)', # h+=X[i]+K[i]
+ '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past
+ '&and ($t1,$f,$e)',
+ '&bic ($t4,$g,$e)',
+ '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+ '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
+ '&orr ($t1,$t1,$t4)', # Ch(e,f,g)
+ '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
+ '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+ '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
+ '&ror ($t0,$t0,"#$Sigma1[0]")',
+ '&eor ($t2,$a,$b)', # a^b, b^c in next round
+ '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
+ '&add ($h,$h,$t0)', # h+=Sigma1(e)
+ '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
+ '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
+ '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
+ '&ror ($t4,$t4,"#$Sigma0[0]")',
+ '&add ($d,$d,$h)', # d+=h
+ '&eor ($t3,$t3,$b)', # Maj(a,b,c)
+ '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+ )
+}
+
+$code.=<<___;
+#ifdef __KERNEL__
+.globl sha256_block_neon
+#endif
+.type sha256_block_neon,%function
+.align 4
+sha256_block_neon:
+.Lneon_entry:
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+ sub sp,sp,#16*4
+
+ adr $Ktbl,.LK256
+ add $num,$inp,$num,lsl#6 // len to point at the end of inp
+
+ ld1.8 {@X[0]},[$inp], #16
+ ld1.8 {@X[1]},[$inp], #16
+ ld1.8 {@X[2]},[$inp], #16
+ ld1.8 {@X[3]},[$inp], #16
+ ld1.32 {$T0},[$Ktbl], #16
+ ld1.32 {$T1},[$Ktbl], #16
+ ld1.32 {$T2},[$Ktbl], #16
+ ld1.32 {$T3},[$Ktbl], #16
+ rev32 @X[0],@X[0] // yes, even on
+ rev32 @X[1],@X[1] // big-endian
+ rev32 @X[2],@X[2]
+ rev32 @X[3],@X[3]
+ mov $Xfer,sp
+ add.32 $T0,$T0,@X[0]
+ add.32 $T1,$T1,@X[1]
+ add.32 $T2,$T2,@X[2]
+ st1.32 {$T0-$T1},[$Xfer], #32
+ add.32 $T3,$T3,@X[3]
+ st1.32 {$T2-$T3},[$Xfer]
+ sub $Xfer,$Xfer,#32
+
+ ldp $A,$B,[$ctx]
+ ldp $C,$D,[$ctx,#8]
+ ldp $E,$F,[$ctx,#16]
+ ldp $G,$H,[$ctx,#24]
+ ldr $t1,[sp,#0]
+ mov $t2,wzr
+ eor $t3,$B,$C
+ mov $t4,wzr
+ b .L_00_48
+
+.align 4
+.L_00_48:
+___
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+$code.=<<___;
+ cmp $t1,#0 // check for K256 terminator
+ ldr $t1,[sp,#0]
+ sub $Xfer,$Xfer,#64
+ bne .L_00_48
+
+ sub $Ktbl,$Ktbl,#256 // rewind $Ktbl
+ cmp $inp,$num
+ mov $Xfer, #64
+ csel $Xfer, $Xfer, xzr, eq
+ sub $inp,$inp,$Xfer // avoid SEGV
+ mov $Xfer,sp
+___
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+$code.=<<___;
+ add $A,$A,$t4 // h+=Sigma0(a) from the past
+ ldp $t0,$t1,[$ctx,#0]
+ add $A,$A,$t2 // h+=Maj(a,b,c) from the past
+ ldp $t2,$t3,[$ctx,#8]
+ add $A,$A,$t0 // accumulate
+ add $B,$B,$t1
+ ldp $t0,$t1,[$ctx,#16]
+ add $C,$C,$t2
+ add $D,$D,$t3
+ ldp $t2,$t3,[$ctx,#24]
+ add $E,$E,$t0
+ add $F,$F,$t1
+ ldr $t1,[sp,#0]
+ stp $A,$B,[$ctx,#0]
+ add $G,$G,$t2
+ mov $t2,wzr
+ stp $C,$D,[$ctx,#8]
+ add $H,$H,$t3
+ stp $E,$F,[$ctx,#16]
+ eor $t3,$B,$C
+ stp $G,$H,[$ctx,#24]
+ mov $t4,wzr
+ mov $Xfer,sp
+ b.ne .L_00_48
+
+ ldr x29,[x29]
+ add sp,sp,#16*4+16
+ ret
+.size sha256_block_neon,.-sha256_block_neon
+___
+}
+
+$code.=<<___;
+#ifndef __KERNEL__
+.comm OPENSSL_armcap_P,4,4
+#endif
+___
+
+{ my %opcode = (
+ "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
+ "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );
+
+ sub unsha256 {
+ my ($mnemonic,$arg)=@_;
+
+ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+ $mnemonic,$arg;
+ }
+}
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/\/\// and !/^$/);
+ print;
+}
+close SELF;
+
+foreach(split("\n",$code)) {
+
+ s/\`([^\`]*)\`/eval($1)/ge;
+
+ s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
+
+ s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers
+
+ s/\.[ui]?8(\s)/$1/;
+ s/\.\w?32\b// and s/\.16b/\.4s/g;
+ m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g;
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/lib/crypto/arm64/sha256-ce.S b/lib/crypto/arm64/sha256-ce.S
new file mode 100644
index 000000000000..e4bfe42a61a9
--- /dev/null
+++ b/lib/crypto/arm64/sha256-ce.S
@@ -0,0 +1,408 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Core SHA-224/SHA-256 transform using v8 Crypto Extensions
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+ .arch armv8-a+crypto
+
+ dga .req q20
+ dgav .req v20
+ dgb .req q21
+ dgbv .req v21
+
+ t0 .req v22
+ t1 .req v23
+
+ dg0q .req q24
+ dg0v .req v24
+ dg1q .req q25
+ dg1v .req v25
+ dg2q .req q26
+ dg2v .req v26
+
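+	/*
+	 * Four rounds of SHA-256. t0 and t1 are used alternately (selected
+	 * by \ev) so that the W+K value for the next group of rounds can be
+	 * computed while sha256h/sha256h2 consume the current one.
+	 */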
+ .macro add_only, ev, rc, s0
+ mov dg2v.16b, dg0v.16b
+ .ifeq \ev
+ add t1.4s, v\s0\().4s, \rc\().4s
+ sha256h dg0q, dg1q, t0.4s
+ sha256h2 dg1q, dg2q, t0.4s
+ .else
+ .ifnb \s0
+ add t0.4s, v\s0\().4s, \rc\().4s
+ .endif
+ sha256h dg0q, dg1q, t1.4s
+ sha256h2 dg1q, dg2q, t1.4s
+ .endif
+ .endm
+
+ .macro add_update, ev, rc, s0, s1, s2, s3
+ sha256su0 v\s0\().4s, v\s1\().4s
+ add_only \ev, \rc, \s1
+ sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s
+ .endm
+
+ /*
+ * The SHA-256 round constants
+ */
+ .section ".rodata", "a"
+ .align 4
+.Lsha2_rcon:
+ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+ .macro load_round_constants tmp
+ adr_l \tmp, .Lsha2_rcon
+ ld1 { v0.4s- v3.4s}, [\tmp], #64
+ ld1 { v4.4s- v7.4s}, [\tmp], #64
+ ld1 { v8.4s-v11.4s}, [\tmp], #64
+ ld1 {v12.4s-v15.4s}, [\tmp]
+ .endm
+
+ /*
+ * size_t __sha256_ce_transform(struct sha256_block_state *state,
+ * const u8 *data, size_t nblocks);
+ */
+ .text
+SYM_FUNC_START(__sha256_ce_transform)
+
+ load_round_constants x8
+
+ /* load state */
+ ld1 {dgav.4s, dgbv.4s}, [x0]
+
+ /* load input */
+0: ld1 {v16.4s-v19.4s}, [x1], #64
+ sub x2, x2, #1
+
+CPU_LE( rev32 v16.16b, v16.16b )
+CPU_LE( rev32 v17.16b, v17.16b )
+CPU_LE( rev32 v18.16b, v18.16b )
+CPU_LE( rev32 v19.16b, v19.16b )
+
+ add t0.4s, v16.4s, v0.4s
+ mov dg0v.16b, dgav.16b
+ mov dg1v.16b, dgbv.16b
+
+ add_update 0, v1, 16, 17, 18, 19
+ add_update 1, v2, 17, 18, 19, 16
+ add_update 0, v3, 18, 19, 16, 17
+ add_update 1, v4, 19, 16, 17, 18
+
+ add_update 0, v5, 16, 17, 18, 19
+ add_update 1, v6, 17, 18, 19, 16
+ add_update 0, v7, 18, 19, 16, 17
+ add_update 1, v8, 19, 16, 17, 18
+
+ add_update 0, v9, 16, 17, 18, 19
+ add_update 1, v10, 17, 18, 19, 16
+ add_update 0, v11, 18, 19, 16, 17
+ add_update 1, v12, 19, 16, 17, 18
+
+ add_only 0, v13, 17
+ add_only 1, v14, 18
+ add_only 0, v15, 19
+ add_only 1
+
+ /* update state */
+ add dgav.4s, dgav.4s, dg0v.4s
+ add dgbv.4s, dgbv.4s, dg1v.4s
+
+ /* return early if voluntary preemption is needed */
+ cond_yield 1f, x5, x6
+
+ /* handled all input blocks? */
+ cbnz x2, 0b
+
+ /* store new state */
+1: st1 {dgav.4s, dgbv.4s}, [x0]
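+	/* return the number of unprocessed blocks */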
+ mov x0, x2
+ ret
+SYM_FUNC_END(__sha256_ce_transform)
+
+ .unreq dga
+ .unreq dgav
+ .unreq dgb
+ .unreq dgbv
+ .unreq t0
+ .unreq t1
+ .unreq dg0q
+ .unreq dg0v
+ .unreq dg1q
+ .unreq dg1v
+ .unreq dg2q
+ .unreq dg2v
+
+ // parameters for sha256_ce_finup2x()
+ ctx .req x0
+ data1 .req x1
+ data2 .req x2
+ len .req w3
+ out1 .req x4
+ out2 .req x5
+
+ // other scalar variables
+ count .req x6
+ final_step .req w7
+
+ // x8-x9 are used as temporaries.
+
+ // v0-v15 are used to cache the SHA-256 round constants.
+ // v16-v19 are used for the message schedule for the first message.
+ // v20-v23 are used for the message schedule for the second message.
+ // v24-v31 are used for the state and temporaries as given below.
+ // *_a are for the first message and *_b for the second.
+ state0_a_q .req q24
+ state0_a .req v24
+ state1_a_q .req q25
+ state1_a .req v25
+ state0_b_q .req q26
+ state0_b .req v26
+ state1_b_q .req q27
+ state1_b .req v27
+ t0_a .req v28
+ t0_b .req v29
+ t1_a_q .req q30
+ t1_a .req v30
+ t1_b_q .req q31
+ t1_b .req v31
+
+#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount)
+#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf)
+// offsetof(struct __sha256_ctx, state) is assumed to be 0.
+
+ // Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a
+ // and m0_b contain the current 4 message schedule words for the first
+ // and second message respectively.
+ //
+ // If not all the message schedule words have been computed yet, then
+ // this also computes 4 more message schedule words for each message.
+ // m1_a-m3_a contain the next 3 groups of 4 message schedule words for
+ // the first message, and likewise m1_b-m3_b for the second. After
+ // consuming the current value of m0_a, this macro computes the group
+ // after m3_a and writes it to m0_a, and likewise for *_b. This means
+ // that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
+ // m3_a, m0_a), and likewise for *_b, so the caller must cycle through
+ // the registers accordingly.
+ .macro do_4rounds_2x i, k, m0_a, m1_a, m2_a, m3_a, \
+ m0_b, m1_b, m2_b, m3_b
+ add t0_a\().4s, \m0_a\().4s, \k\().4s
+ add t0_b\().4s, \m0_b\().4s, \k\().4s
+ .if \i < 48
+ sha256su0 \m0_a\().4s, \m1_a\().4s
+ sha256su0 \m0_b\().4s, \m1_b\().4s
+ sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s
+ sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s
+ .endif
+ mov t1_a.16b, state0_a.16b
+ mov t1_b.16b, state0_b.16b
+ sha256h state0_a_q, state1_a_q, t0_a\().4s
+ sha256h state0_b_q, state1_b_q, t0_b\().4s
+ sha256h2 state1_a_q, t1_a_q, t0_a\().4s
+ sha256h2 state1_b_q, t1_b_q, t0_b\().4s
+ .endm
+
+ .macro do_16rounds_2x i, k0, k1, k2, k3
+ do_4rounds_2x \i + 0, \k0, v16, v17, v18, v19, v20, v21, v22, v23
+ do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20
+ do_4rounds_2x \i + 8, \k2, v18, v19, v16, v17, v22, v23, v20, v21
+ do_4rounds_2x \i + 12, \k3, v19, v16, v17, v18, v23, v20, v21, v22
+ .endm
+
+//
+// void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
+// const u8 *data1, const u8 *data2, int len,
+// u8 out1[SHA256_DIGEST_SIZE],
+// u8 out2[SHA256_DIGEST_SIZE]);
+//
+// This function computes the SHA-256 digests of two messages |data1| and
+// |data2| that are both |len| bytes long, starting from the initial context
+// |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
+//
+// The instructions for the two SHA-256 operations are interleaved. On many
+// CPUs, this is almost twice as fast as hashing each message individually due
+// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
+//
+SYM_FUNC_START(sha256_ce_finup2x)
+ sub sp, sp, #128
+ mov final_step, #0
+ load_round_constants x8
+
+ // Load the initial state from ctx->state.
+ ld1 {state0_a.4s-state1_a.4s}, [ctx]
+
+	// Load ctx->bytecount. Take it mod 64 to get the number of bytes
+	// that are buffered in ctx->buf. Also save bytecount + len in a
+	// register (count), which is needed later for the length field of
+	// the padding.
+ ldr x8, [ctx, #OFFSETOF_BYTECOUNT]
+ add count, x8, len, sxtw
+ and x8, x8, #63
+ cbz x8, .Lfinup2x_enter_loop // No bytes buffered?
+
+ // x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them
+ // followed by the first 64 - x8 bytes of data. Since len >= 64, we
+ // just load 64 bytes from each of ctx->buf, data1, and data2
+ // unconditionally and rearrange the data as needed.
+ add x9, ctx, #OFFSETOF_BUF
+ ld1 {v16.16b-v19.16b}, [x9]
+ st1 {v16.16b-v19.16b}, [sp]
+
+ ld1 {v16.16b-v19.16b}, [data1], #64
+ add x9, sp, x8
+ st1 {v16.16b-v19.16b}, [x9]
+ ld1 {v16.4s-v19.4s}, [sp]
+
+ ld1 {v20.16b-v23.16b}, [data2], #64
+ st1 {v20.16b-v23.16b}, [x9]
+ ld1 {v20.4s-v23.4s}, [sp]
+
+ sub len, len, #64
+ sub data1, data1, x8
+ sub data2, data2, x8
+ add len, len, w8
+ mov state0_b.16b, state0_a.16b
+ mov state1_b.16b, state1_a.16b
+ b .Lfinup2x_loop_have_data
+
+.Lfinup2x_enter_loop:
+ sub len, len, #64
+ mov state0_b.16b, state0_a.16b
+ mov state1_b.16b, state1_a.16b
+.Lfinup2x_loop:
+ // Load the next two data blocks.
+ ld1 {v16.4s-v19.4s}, [data1], #64
+ ld1 {v20.4s-v23.4s}, [data2], #64
+.Lfinup2x_loop_have_data:
+ // Convert the words of the data blocks from big endian.
+CPU_LE( rev32 v16.16b, v16.16b )
+CPU_LE( rev32 v17.16b, v17.16b )
+CPU_LE( rev32 v18.16b, v18.16b )
+CPU_LE( rev32 v19.16b, v19.16b )
+CPU_LE( rev32 v20.16b, v20.16b )
+CPU_LE( rev32 v21.16b, v21.16b )
+CPU_LE( rev32 v22.16b, v22.16b )
+CPU_LE( rev32 v23.16b, v23.16b )
+.Lfinup2x_loop_have_bswapped_data:
+
+ // Save the original state for each block.
+ st1 {state0_a.4s-state1_b.4s}, [sp]
+
+ // Do the SHA-256 rounds on each block.
+ do_16rounds_2x 0, v0, v1, v2, v3
+ do_16rounds_2x 16, v4, v5, v6, v7
+ do_16rounds_2x 32, v8, v9, v10, v11
+ do_16rounds_2x 48, v12, v13, v14, v15
+
+ // Add the original state for each block.
+ ld1 {v16.4s-v19.4s}, [sp]
+ add state0_a.4s, state0_a.4s, v16.4s
+ add state1_a.4s, state1_a.4s, v17.4s
+ add state0_b.4s, state0_b.4s, v18.4s
+ add state1_b.4s, state1_b.4s, v19.4s
+
+ // Update len and loop back if more blocks remain.
+ sub len, len, #64
+ tbz len, #31, .Lfinup2x_loop // len >= 0?
+
+ // Check if any final blocks need to be handled.
+ // final_step = 2: all done
+ // final_step = 1: need to do count-only padding block
+ // final_step = 0: need to do the block with 0x80 padding byte
+ tbnz final_step, #1, .Lfinup2x_done
+ tbnz final_step, #0, .Lfinup2x_finalize_countonly
+ add len, len, #64
+ cbz len, .Lfinup2x_finalize_blockaligned
+
+ // Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block.
+ // To do this, write the padding starting with the 0x80 byte to
+ // &sp[64]. Then for each message, copy the last 64 data bytes to sp
+ // and load from &sp[64 - len] to get the needed padding block. This
+ // code relies on the data buffers being >= 64 bytes in length.
+ sub w8, len, #64 // w8 = len - 64
+ add data1, data1, w8, sxtw // data1 += len - 64
+ add data2, data2, w8, sxtw // data2 += len - 64
+CPU_LE( mov x9, #0x80 )
+CPU_LE( fmov d16, x9 )
+CPU_BE( movi v16.16b, #0 )
+CPU_BE( mov x9, #0x8000000000000000 )
+CPU_BE( mov v16.d[1], x9 )
+ movi v17.16b, #0
+ stp q16, q17, [sp, #64]
+ stp q17, q17, [sp, #96]
+ sub x9, sp, w8, sxtw // x9 = &sp[64 - len]
+ cmp len, #56
+ b.ge 1f // will count spill into its own block?
+ lsl count, count, #3
+CPU_LE( rev count, count )
+ str count, [x9, #56]
+ mov final_step, #2 // won't need count-only block
+ b 2f
+1:
+ mov final_step, #1 // will need count-only block
+2:
+ ld1 {v16.16b-v19.16b}, [data1]
+ st1 {v16.16b-v19.16b}, [sp]
+ ld1 {v16.4s-v19.4s}, [x9]
+ ld1 {v20.16b-v23.16b}, [data2]
+ st1 {v20.16b-v23.16b}, [sp]
+ ld1 {v20.4s-v23.4s}, [x9]
+ b .Lfinup2x_loop_have_data
+
+ // Prepare a padding block, either:
+ //
+ // {0x80, 0, 0, 0, ..., count (as __be64)}
+ // This is for a block aligned message.
+ //
+ // { 0, 0, 0, 0, ..., count (as __be64)}
+ // This is for a message whose length mod 64 is >= 56.
+ //
+ // Pre-swap the endianness of the words.
+.Lfinup2x_finalize_countonly:
+ movi v16.2d, #0
+ b 1f
+.Lfinup2x_finalize_blockaligned:
+ mov x8, #0x80000000
+ fmov d16, x8
+1:
+ movi v17.2d, #0
+ movi v18.2d, #0
+ ror count, count, #29 // ror(lsl(count, 3), 32)
+ mov v19.d[0], xzr
+ mov v19.d[1], count
+ mov v20.16b, v16.16b
+ movi v21.2d, #0
+ movi v22.2d, #0
+ mov v23.16b, v19.16b
+ mov final_step, #2
+ b .Lfinup2x_loop_have_bswapped_data
+
+.Lfinup2x_done:
+ // Write the two digests with all bytes in the correct order.
+CPU_LE( rev32 state0_a.16b, state0_a.16b )
+CPU_LE( rev32 state1_a.16b, state1_a.16b )
+CPU_LE( rev32 state0_b.16b, state0_b.16b )
+CPU_LE( rev32 state1_b.16b, state1_b.16b )
+ st1 {state0_a.4s-state1_a.4s}, [out1]
+ st1 {state0_b.4s-state1_b.4s}, [out2]
+ add sp, sp, #128
+ ret
+SYM_FUNC_END(sha256_ce_finup2x)
diff --git a/lib/crypto/arm64/sha256.h b/lib/crypto/arm64/sha256.h
new file mode 100644
index 000000000000..568dff0f276a
--- /dev/null
+++ b/lib/crypto/arm64/sha256.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 optimized for ARM64
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+asmlinkage void sha256_block_data_order(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks);
+asmlinkage void sha256_block_neon(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks);
+asmlinkage size_t __sha256_ce_transform(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha256_blocks(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ static_branch_likely(&have_neon) && likely(may_use_simd())) {
+ if (static_branch_likely(&have_ce)) {
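+			/*
+			 * __sha256_ce_transform() returns the number of
+			 * blocks it did not process because a voluntary
+			 * preemption point was reached; loop until all
+			 * blocks have been consumed.
+			 */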
+ do {
+ size_t rem;
+
+ scoped_ksimd()
+ rem = __sha256_ce_transform(state, data,
+ nblocks);
+
+ data += (nblocks - rem) * SHA256_BLOCK_SIZE;
+ nblocks = rem;
+ } while (nblocks);
+ } else {
+ scoped_ksimd()
+ sha256_block_neon(state, data, nblocks);
+ }
+ } else {
+ sha256_block_data_order(state, data, nblocks);
+ }
+}
+
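+/*
+ * These offsets are hard-coded in sha256-ce.S (sha256_ce_finup2x), so
+ * check them at build time.
+ */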
+static_assert(offsetof(struct __sha256_ctx, state) == 0);
+static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
+static_assert(offsetof(struct __sha256_ctx, buf) == 40);
+asmlinkage void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, int len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE]);
+
+#define sha256_finup_2x_arch sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, size_t len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ /*
+ * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
+ * Further limit len to 65536 to avoid spending too long with preemption
+ * disabled. (Of course, in practice len is nearly always 4096 anyway.)
+ */
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ static_branch_likely(&have_ce) && len >= SHA256_BLOCK_SIZE &&
+ len <= 65536 && likely(may_use_simd())) {
+ scoped_ksimd()
+ sha256_ce_finup2x(ctx, data1, data2, len, out1, out2);
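+		/*
+		 * KMSAN doesn't track stores done by the assembly code, so
+		 * unpoison the digests explicitly.
+		 */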
+ kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
+ kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
+ return true;
+ }
+ return false;
+}
+
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+ return static_key_enabled(&have_ce);
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define sha256_mod_init_arch sha256_mod_init_arch
+static void sha256_mod_init_arch(void)
+{
+ if (cpu_have_named_feature(ASIMD)) {
+ static_branch_enable(&have_neon);
+ if (cpu_have_named_feature(SHA2))
+ static_branch_enable(&have_ce);
+ }
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
diff --git a/lib/crypto/arm64/sha3-ce-core.S b/lib/crypto/arm64/sha3-ce-core.S
new file mode 100644
index 000000000000..ace90b506490
--- /dev/null
+++ b/lib/crypto/arm64/sha3-ce-core.S
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Core SHA-3 transform using v8.2 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+ .set .Lv\b\().2d, \b
+ .set .Lv\b\().16b, \b
+ .endr
+
+	/*
+	 * ARMv8.2 Crypto Extensions instructions, encoded manually (via
+	 * the register-number mapping set up above) so that the code
+	 * assembles even with toolchains that lack SHA-3 support.
+	 */
+ .macro eor3, rd, rn, rm, ra
+ .inst 0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
+ .endm
+
+ .macro rax1, rd, rn, rm
+ .inst 0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
+ .endm
+
+ .macro bcax, rd, rn, rm, ra
+ .inst 0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
+ .endm
+
+ .macro xar, rd, rn, rm, imm6
+ .inst 0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16)
+ .endm
+
+ /*
+ * size_t sha3_ce_transform(struct sha3_state *state, const u8 *data,
+ * size_t nblocks, size_t block_size)
+ *
+ * block_size is assumed to be one of 72 (SHA3-512), 104 (SHA3-384), 136
+ * (SHA3-256 and SHAKE256), 144 (SHA3-224), or 168 (SHAKE128).
+ */
+ .text
+SYM_FUNC_START(sha3_ce_transform)
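+	/* the 25 64-bit lanes of the Keccak state are kept in v0-v24 */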
+ /* load state */
+ add x8, x0, #32
+ ld1 { v0.1d- v3.1d}, [x0]
+ ld1 { v4.1d- v7.1d}, [x8], #32
+ ld1 { v8.1d-v11.1d}, [x8], #32
+ ld1 {v12.1d-v15.1d}, [x8], #32
+ ld1 {v16.1d-v19.1d}, [x8], #32
+ ld1 {v20.1d-v23.1d}, [x8], #32
+ ld1 {v24.1d}, [x8]
+
+0: sub x2, x2, #1
+ mov w8, #24
+ adr_l x9, .Lsha3_rcon
+
+ /* load input */
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ ld1 {v29.8b}, [x1], #8
+ eor v0.8b, v0.8b, v25.8b
+ eor v1.8b, v1.8b, v26.8b
+ eor v2.8b, v2.8b, v27.8b
+ eor v3.8b, v3.8b, v28.8b
+ eor v4.8b, v4.8b, v29.8b
+
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ eor v5.8b, v5.8b, v25.8b
+ eor v6.8b, v6.8b, v26.8b
+ eor v7.8b, v7.8b, v27.8b
+ eor v8.8b, v8.8b, v28.8b
+ cmp x3, #72
+ b.eq 3f /* SHA3-512 (block_size=72)? */
+
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ eor v9.8b, v9.8b, v25.8b
+ eor v10.8b, v10.8b, v26.8b
+ eor v11.8b, v11.8b, v27.8b
+ eor v12.8b, v12.8b, v28.8b
+ cmp x3, #104
+ b.eq 3f /* SHA3-384 (block_size=104)? */
+
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ eor v13.8b, v13.8b, v25.8b
+ eor v14.8b, v14.8b, v26.8b
+ eor v15.8b, v15.8b, v27.8b
+ eor v16.8b, v16.8b, v28.8b
+ cmp x3, #144
+ b.lt 3f /* SHA3-256 or SHAKE256 (block_size=136)? */
+ b.eq 2f /* SHA3-224 (block_size=144)? */
+
+ /* SHAKE128 (block_size=168) */
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ eor v17.8b, v17.8b, v25.8b
+ eor v18.8b, v18.8b, v26.8b
+ eor v19.8b, v19.8b, v27.8b
+ eor v20.8b, v20.8b, v28.8b
+ b 3f
+2:
+ /* SHA3-224 (block_size=144) */
+ ld1 {v25.8b}, [x1], #8
+ eor v17.8b, v17.8b, v25.8b
+
+3: sub w8, w8, #1
+
+ eor3 v29.16b, v4.16b, v9.16b, v14.16b
+ eor3 v26.16b, v1.16b, v6.16b, v11.16b
+ eor3 v28.16b, v3.16b, v8.16b, v13.16b
+ eor3 v25.16b, v0.16b, v5.16b, v10.16b
+ eor3 v27.16b, v2.16b, v7.16b, v12.16b
+ eor3 v29.16b, v29.16b, v19.16b, v24.16b
+ eor3 v26.16b, v26.16b, v16.16b, v21.16b
+ eor3 v28.16b, v28.16b, v18.16b, v23.16b
+ eor3 v25.16b, v25.16b, v15.16b, v20.16b
+ eor3 v27.16b, v27.16b, v17.16b, v22.16b
+
+ rax1 v30.2d, v29.2d, v26.2d // bc[0]
+ rax1 v26.2d, v26.2d, v28.2d // bc[2]
+ rax1 v28.2d, v28.2d, v25.2d // bc[4]
+ rax1 v25.2d, v25.2d, v27.2d // bc[1]
+ rax1 v27.2d, v27.2d, v29.2d // bc[3]
+
+ eor v0.16b, v0.16b, v30.16b
+ xar v29.2d, v1.2d, v25.2d, (64 - 1)
+ xar v1.2d, v6.2d, v25.2d, (64 - 44)
+ xar v6.2d, v9.2d, v28.2d, (64 - 20)
+ xar v9.2d, v22.2d, v26.2d, (64 - 61)
+ xar v22.2d, v14.2d, v28.2d, (64 - 39)
+ xar v14.2d, v20.2d, v30.2d, (64 - 18)
+ xar v31.2d, v2.2d, v26.2d, (64 - 62)
+ xar v2.2d, v12.2d, v26.2d, (64 - 43)
+ xar v12.2d, v13.2d, v27.2d, (64 - 25)
+ xar v13.2d, v19.2d, v28.2d, (64 - 8)
+ xar v19.2d, v23.2d, v27.2d, (64 - 56)
+ xar v23.2d, v15.2d, v30.2d, (64 - 41)
+ xar v15.2d, v4.2d, v28.2d, (64 - 27)
+ xar v28.2d, v24.2d, v28.2d, (64 - 14)
+ xar v24.2d, v21.2d, v25.2d, (64 - 2)
+ xar v8.2d, v8.2d, v27.2d, (64 - 55)
+ xar v4.2d, v16.2d, v25.2d, (64 - 45)
+ xar v16.2d, v5.2d, v30.2d, (64 - 36)
+ xar v5.2d, v3.2d, v27.2d, (64 - 28)
+ xar v27.2d, v18.2d, v27.2d, (64 - 21)
+ xar v3.2d, v17.2d, v26.2d, (64 - 15)
+ xar v25.2d, v11.2d, v25.2d, (64 - 10)
+ xar v26.2d, v7.2d, v26.2d, (64 - 6)
+ xar v30.2d, v10.2d, v30.2d, (64 - 3)
+
+ bcax v20.16b, v31.16b, v22.16b, v8.16b
+ bcax v21.16b, v8.16b, v23.16b, v22.16b
+ bcax v22.16b, v22.16b, v24.16b, v23.16b
+ bcax v23.16b, v23.16b, v31.16b, v24.16b
+ bcax v24.16b, v24.16b, v8.16b, v31.16b
+
+ ld1r {v31.2d}, [x9], #8
+
+ bcax v17.16b, v25.16b, v19.16b, v3.16b
+ bcax v18.16b, v3.16b, v15.16b, v19.16b
+ bcax v19.16b, v19.16b, v16.16b, v15.16b
+ bcax v15.16b, v15.16b, v25.16b, v16.16b
+ bcax v16.16b, v16.16b, v3.16b, v25.16b
+
+ bcax v10.16b, v29.16b, v12.16b, v26.16b
+ bcax v11.16b, v26.16b, v13.16b, v12.16b
+ bcax v12.16b, v12.16b, v14.16b, v13.16b
+ bcax v13.16b, v13.16b, v29.16b, v14.16b
+ bcax v14.16b, v14.16b, v26.16b, v29.16b
+
+ bcax v7.16b, v30.16b, v9.16b, v4.16b
+ bcax v8.16b, v4.16b, v5.16b, v9.16b
+ bcax v9.16b, v9.16b, v6.16b, v5.16b
+ bcax v5.16b, v5.16b, v30.16b, v6.16b
+ bcax v6.16b, v6.16b, v4.16b, v30.16b
+
+ bcax v3.16b, v27.16b, v0.16b, v28.16b
+ bcax v4.16b, v28.16b, v1.16b, v0.16b
+ bcax v0.16b, v0.16b, v2.16b, v1.16b
+ bcax v1.16b, v1.16b, v27.16b, v2.16b
+ bcax v2.16b, v2.16b, v28.16b, v27.16b
+
+ eor v0.16b, v0.16b, v31.16b
+
+ cbnz w8, 3b
+ cond_yield 4f, x8, x9
+ cbnz x2, 0b
+
+ /* save state */
+4: st1 { v0.1d- v3.1d}, [x0], #32
+ st1 { v4.1d- v7.1d}, [x0], #32
+ st1 { v8.1d-v11.1d}, [x0], #32
+ st1 {v12.1d-v15.1d}, [x0], #32
+ st1 {v16.1d-v19.1d}, [x0], #32
+ st1 {v20.1d-v23.1d}, [x0], #32
+ st1 {v24.1d}, [x0]
+ mov x0, x2
+ ret
+SYM_FUNC_END(sha3_ce_transform)
+
+ .section ".rodata", "a"
+ .align 8
+.Lsha3_rcon:
+ .quad 0x0000000000000001, 0x0000000000008082, 0x800000000000808a
+ .quad 0x8000000080008000, 0x000000000000808b, 0x0000000080000001
+ .quad 0x8000000080008081, 0x8000000000008009, 0x000000000000008a
+ .quad 0x0000000000000088, 0x0000000080008009, 0x000000008000000a
+ .quad 0x000000008000808b, 0x800000000000008b, 0x8000000000008089
+ .quad 0x8000000000008003, 0x8000000000008002, 0x8000000000000080
+ .quad 0x000000000000800a, 0x800000008000000a, 0x8000000080008081
+ .quad 0x8000000000008080, 0x0000000080000001, 0x8000000080008008
diff --git a/lib/crypto/arm64/sha3.h b/lib/crypto/arm64/sha3.h
new file mode 100644
index 000000000000..b602f1b3b282
--- /dev/null
+++ b/lib/crypto/arm64/sha3.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3);
+
+asmlinkage size_t sha3_ce_transform(struct sha3_state *state, const u8 *data,
+ size_t nblocks, size_t block_size);
+
+static void sha3_absorb_blocks(struct sha3_state *state, const u8 *data,
+ size_t nblocks, size_t block_size)
+{
+ if (static_branch_likely(&have_sha3) && likely(may_use_simd())) {
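+		/*
+		 * sha3_ce_transform() returns the number of blocks left
+		 * unprocessed if it yields to voluntary preemption.
+		 */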
+ do {
+ size_t rem;
+
+ scoped_ksimd()
+ rem = sha3_ce_transform(state, data, nblocks,
+ block_size);
+ data += (nblocks - rem) * block_size;
+ nblocks = rem;
+ } while (nblocks);
+ } else {
+ sha3_absorb_blocks_generic(state, data, nblocks, block_size);
+ }
+}
+
+static void sha3_keccakf(struct sha3_state *state)
+{
+ if (static_branch_likely(&have_sha3) && likely(may_use_simd())) {
+ /*
+ * Passing zeroes into sha3_ce_transform() gives the plain
+ * Keccak-f permutation, which is what we want here. Any
+ * supported block size may be used. Use SHA3_512_BLOCK_SIZE
+ * since it's the shortest.
+ */
+ static const u8 zeroes[SHA3_512_BLOCK_SIZE];
+
+ scoped_ksimd()
+ sha3_ce_transform(state, zeroes, 1, sizeof(zeroes));
+ } else {
+ sha3_keccakf_generic(state);
+ }
+}
+
+#define sha3_mod_init_arch sha3_mod_init_arch
+static void sha3_mod_init_arch(void)
+{
+ if (cpu_have_named_feature(SHA3))
+ static_branch_enable(&have_sha3);
+}
diff --git a/lib/crypto/arm64/sha512-ce-core.S b/lib/crypto/arm64/sha512-ce-core.S
new file mode 100644
index 000000000000..ffd51acfd1ee
--- /dev/null
+++ b/lib/crypto/arm64/sha512-ce-core.S
@@ -0,0 +1,197 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Core SHA-384/SHA-512 transform using v8 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ /*
+ * We have to specify the "sha3" feature here, since the GNU and clang
+ * assemblers both consider the SHA-512 instructions to be part of the
+ * "sha3" feature. (Except binutils 2.30 through 2.42, which used
+ * "sha2". But "sha3" implies "sha2", so "sha3" still works in those
+ * versions.) "sha3" doesn't make a lot of sense, since SHA-512 is part
+ * of the SHA-2 family of algorithms, and also the Arm Architecture
+ * Reference Manual defines FEAT_SHA512 and FEAT_SHA3 separately.
+ * Regardless, we must use "sha3" to be compatible with the assemblers.
+ */
+ .arch armv8-a+sha3
+
+ /*
+ * The SHA-512 round constants
+ */
+ .section ".rodata", "a"
+ .align 4
+.Lsha512_rcon:
+ .quad 0x428a2f98d728ae22, 0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538, 0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242, 0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235, 0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
+ .quad 0x983e5152ee66dfab, 0xa831c66d2db43210
+ .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725
+ .quad 0x06ca6351e003826f, 0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
+ .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6, 0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218, 0xd69906245565a910
+ .quad 0xf40e35855771202a, 0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec
+ .quad 0x90befffa23631e28, 0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b
+ .quad 0xca273eceea26619c, 0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae, 0x1b710b35131c471b
+ .quad 0x28db77f523047d84, 0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
+
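+	/*
+	 * One dround does two rounds of SHA-512. i0-i4 index the five
+	 * rotating state registers (see the table in the function body),
+	 * rc0 holds the current pair of round constants, and rc1, if
+	 * given, is loaded with the next pair. in0-in4 index the message
+	 * schedule registers; when in1 is given, the next pair of
+	 * schedule words is also computed into in0.
+	 */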
+ .macro dround, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4
+ .ifnb \rc1
+ ld1 {v\rc1\().2d}, [x4], #16
+ .endif
+ add v5.2d, v\rc0\().2d, v\in0\().2d
+ ext v6.16b, v\i2\().16b, v\i3\().16b, #8
+ ext v5.16b, v5.16b, v5.16b, #8
+ ext v7.16b, v\i1\().16b, v\i2\().16b, #8
+ add v\i3\().2d, v\i3\().2d, v5.2d
+ .ifnb \in1
+ ext v5.16b, v\in3\().16b, v\in4\().16b, #8
+ sha512su0 v\in0\().2d, v\in1\().2d
+ .endif
+ sha512h q\i3, q6, v7.2d
+ .ifnb \in1
+ sha512su1 v\in0\().2d, v\in2\().2d, v5.2d
+ .endif
+ add v\i4\().2d, v\i1\().2d, v\i3\().2d
+ sha512h2 q\i3, q\i1, v\i0\().2d
+ .endm
+
+ /*
+ * size_t __sha512_ce_transform(struct sha512_block_state *state,
+ * const u8 *data, size_t nblocks);
+ */
+ .text
+SYM_FUNC_START(__sha512_ce_transform)
+ /* load state */
+ ld1 {v8.2d-v11.2d}, [x0]
+
+ /* load first 4 round constants */
+ adr_l x3, .Lsha512_rcon
+ ld1 {v20.2d-v23.2d}, [x3], #64
+
+ /* load input */
+0: ld1 {v12.2d-v15.2d}, [x1], #64
+ ld1 {v16.2d-v19.2d}, [x1], #64
+ sub x2, x2, #1
+
+CPU_LE( rev64 v12.16b, v12.16b )
+CPU_LE( rev64 v13.16b, v13.16b )
+CPU_LE( rev64 v14.16b, v14.16b )
+CPU_LE( rev64 v15.16b, v15.16b )
+CPU_LE( rev64 v16.16b, v16.16b )
+CPU_LE( rev64 v17.16b, v17.16b )
+CPU_LE( rev64 v18.16b, v18.16b )
+CPU_LE( rev64 v19.16b, v19.16b )
+
+ mov x4, x3 // rc pointer
+
+ mov v0.16b, v8.16b
+ mov v1.16b, v9.16b
+ mov v2.16b, v10.16b
+ mov v3.16b, v11.16b
+
+ // v0 ab cd -- ef gh ab
+ // v1 cd -- ef gh ab cd
+ // v2 ef gh ab cd -- ef
+ // v3 gh ab cd -- ef gh
+ // v4 -- ef gh ab cd --
+
+ dround 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17
+ dround 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18
+ dround 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19
+ dround 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12
+ dround 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13
+
+ dround 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14
+ dround 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15
+ dround 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16
+ dround 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17
+ dround 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18
+
+ dround 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19
+ dround 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12
+ dround 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13
+ dround 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14
+ dround 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15
+
+ dround 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16
+ dround 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17
+ dround 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18
+ dround 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19
+ dround 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12
+
+ dround 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13
+ dround 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14
+ dround 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15
+ dround 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16
+ dround 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17
+
+ dround 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18
+ dround 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19
+ dround 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12
+ dround 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13
+ dround 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14
+
+ dround 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15
+ dround 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16
+ dround 2, 3, 1, 4, 0, 28, 24, 12
+ dround 4, 2, 0, 1, 3, 29, 25, 13
+ dround 1, 4, 3, 0, 2, 30, 26, 14
+
+ dround 0, 1, 2, 3, 4, 31, 27, 15
+ dround 3, 0, 4, 2, 1, 24, , 16
+ dround 2, 3, 1, 4, 0, 25, , 17
+ dround 4, 2, 0, 1, 3, 26, , 18
+ dround 1, 4, 3, 0, 2, 27, , 19
+
+ /* update state */
+ add v8.2d, v8.2d, v0.2d
+ add v9.2d, v9.2d, v1.2d
+ add v10.2d, v10.2d, v2.2d
+ add v11.2d, v11.2d, v3.2d
+
+ cond_yield 3f, x4, x5
+ /* handled all input blocks? */
+ cbnz x2, 0b
+
+ /* store new state */
+3: st1 {v8.2d-v11.2d}, [x0]
+ mov x0, x2
+ ret
+SYM_FUNC_END(__sha512_ce_transform)
diff --git a/lib/crypto/arm64/sha512.h b/lib/crypto/arm64/sha512.h
new file mode 100644
index 000000000000..7eb7ef04d268
--- /dev/null
+++ b/lib/crypto/arm64/sha512.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * arm64-optimized SHA-512 block function
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha512_insns);
+
+asmlinkage void sha512_block_data_order(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks);
+asmlinkage size_t __sha512_ce_transform(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha512_blocks(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ static_branch_likely(&have_sha512_insns) &&
+ likely(may_use_simd())) {
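+		/*
+		 * __sha512_ce_transform() returns the number of blocks left
+		 * unprocessed if it yields to voluntary preemption.
+		 */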
+ do {
+ size_t rem;
+
+ scoped_ksimd()
+ rem = __sha512_ce_transform(state, data, nblocks);
+
+ data += (nblocks - rem) * SHA512_BLOCK_SIZE;
+ nblocks = rem;
+ } while (nblocks);
+ } else {
+ sha512_block_data_order(state, data, nblocks);
+ }
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define sha512_mod_init_arch sha512_mod_init_arch
+static void sha512_mod_init_arch(void)
+{
+ if (cpu_have_named_feature(SHA512))
+ static_branch_enable(&have_sha512_insns);
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
diff --git a/lib/crypto/blake2b.c b/lib/crypto/blake2b.c
new file mode 100644
index 000000000000..09c6d65d8a6e
--- /dev/null
+++ b/lib/crypto/blake2b.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2025 Google LLC
+ *
+ * This is an implementation of the BLAKE2b hash and PRF functions.
+ *
+ * Information: https://blake2.net/
+ */
+
+#include <crypto/blake2b.h>
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+static const u8 blake2b_sigma[12][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
+};
+
+static inline void blake2b_increment_counter(struct blake2b_ctx *ctx, u32 inc)
+{
+ ctx->t[0] += inc;
+ ctx->t[1] += (ctx->t[0] < inc);
+}
+
+static void __maybe_unused
+blake2b_compress_generic(struct blake2b_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc)
+{
+ u64 m[16];
+ u64 v[16];
+ int i;
+
+ WARN_ON(IS_ENABLED(DEBUG) &&
+ (nblocks > 1 && inc != BLAKE2B_BLOCK_SIZE));
+
+ while (nblocks > 0) {
+ blake2b_increment_counter(ctx, inc);
+ memcpy(m, data, BLAKE2B_BLOCK_SIZE);
+ le64_to_cpu_array(m, ARRAY_SIZE(m));
+ memcpy(v, ctx->h, 64);
+ v[ 8] = BLAKE2B_IV0;
+ v[ 9] = BLAKE2B_IV1;
+ v[10] = BLAKE2B_IV2;
+ v[11] = BLAKE2B_IV3;
+ v[12] = BLAKE2B_IV4 ^ ctx->t[0];
+ v[13] = BLAKE2B_IV5 ^ ctx->t[1];
+ v[14] = BLAKE2B_IV6 ^ ctx->f[0];
+ v[15] = BLAKE2B_IV7 ^ ctx->f[1];
+
+#define G(r, i, a, b, c, d) do { \
+ a += b + m[blake2b_sigma[r][2 * i + 0]]; \
+ d = ror64(d ^ a, 32); \
+ c += d; \
+ b = ror64(b ^ c, 24); \
+ a += b + m[blake2b_sigma[r][2 * i + 1]]; \
+ d = ror64(d ^ a, 16); \
+ c += d; \
+ b = ror64(b ^ c, 63); \
+} while (0)
+
+#define ROUND(r) do { \
+ G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
+ G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
+ G(r, 2, v[2], v[ 6], v[10], v[14]); \
+ G(r, 3, v[3], v[ 7], v[11], v[15]); \
+ G(r, 4, v[0], v[ 5], v[10], v[15]); \
+ G(r, 5, v[1], v[ 6], v[11], v[12]); \
+ G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
+ G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
+} while (0)
+ ROUND(0);
+ ROUND(1);
+ ROUND(2);
+ ROUND(3);
+ ROUND(4);
+ ROUND(5);
+ ROUND(6);
+ ROUND(7);
+ ROUND(8);
+ ROUND(9);
+ ROUND(10);
+ ROUND(11);
+
+#undef G
+#undef ROUND
+
+ for (i = 0; i < 8; ++i)
+ ctx->h[i] ^= v[i] ^ v[i + 8];
+
+ data += BLAKE2B_BLOCK_SIZE;
+ --nblocks;
+ }
+}
+
+#ifdef CONFIG_CRYPTO_LIB_BLAKE2B_ARCH
+#include "blake2b.h" /* $(SRCARCH)/blake2b.h */
+#else
+#define blake2b_compress blake2b_compress_generic
+#endif
+
+static inline void blake2b_set_lastblock(struct blake2b_ctx *ctx)
+{
+ ctx->f[0] = -1;
+}
+
+void blake2b_update(struct blake2b_ctx *ctx, const u8 *in, size_t inlen)
+{
+ const size_t fill = BLAKE2B_BLOCK_SIZE - ctx->buflen;
+
+ if (unlikely(!inlen))
+ return;
+ if (inlen > fill) {
+ memcpy(ctx->buf + ctx->buflen, in, fill);
+ blake2b_compress(ctx, ctx->buf, 1, BLAKE2B_BLOCK_SIZE);
+ ctx->buflen = 0;
+ in += fill;
+ inlen -= fill;
+ }
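+	/*
+	 * Compress all remaining full blocks except the last one: BLAKE2b
+	 * needs to know which block is final, so the last block always
+	 * stays buffered until blake2b_final().
+	 */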
+ if (inlen > BLAKE2B_BLOCK_SIZE) {
+ const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2B_BLOCK_SIZE);
+
+ blake2b_compress(ctx, in, nblocks - 1, BLAKE2B_BLOCK_SIZE);
+ in += BLAKE2B_BLOCK_SIZE * (nblocks - 1);
+ inlen -= BLAKE2B_BLOCK_SIZE * (nblocks - 1);
+ }
+ memcpy(ctx->buf + ctx->buflen, in, inlen);
+ ctx->buflen += inlen;
+}
+EXPORT_SYMBOL(blake2b_update);
+
+void blake2b_final(struct blake2b_ctx *ctx, u8 *out)
+{
+ WARN_ON(IS_ENABLED(DEBUG) && !out);
+ blake2b_set_lastblock(ctx);
+ memset(ctx->buf + ctx->buflen, 0,
+ BLAKE2B_BLOCK_SIZE - ctx->buflen); /* Padding */
+ blake2b_compress(ctx, ctx->buf, 1, ctx->buflen);
+ cpu_to_le64_array(ctx->h, ARRAY_SIZE(ctx->h));
+ memcpy(out, ctx->h, ctx->outlen);
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL(blake2b_final);
+
+#ifdef blake2b_mod_init_arch
+static int __init blake2b_mod_init(void)
+{
+ blake2b_mod_init_arch();
+ return 0;
+}
+subsys_initcall(blake2b_mod_init);
+
+static void __exit blake2b_mod_exit(void)
+{
+}
+module_exit(blake2b_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("BLAKE2b hash function");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c
deleted file mode 100644
index 09682136b57c..000000000000
--- a/lib/crypto/blake2s-generic.c
+++ /dev/null
@@ -1,110 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * This is an implementation of the BLAKE2s hash and PRF functions.
- *
- * Information: https://blake2.net/
- *
- */
-
-#include <crypto/internal/blake2s.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/bug.h>
-#include <linux/unaligned.h>
-
-static const u8 blake2s_sigma[10][16] = {
- { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
- { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
- { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
- { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
- { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
- { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
- { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
- { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
- { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
- { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
-};
-
-static inline void blake2s_increment_counter(struct blake2s_state *state,
- const u32 inc)
-{
- state->t[0] += inc;
- state->t[1] += (state->t[0] < inc);
-}
-
-void blake2s_compress(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc)
- __weak __alias(blake2s_compress_generic);
-
-void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc)
-{
- u32 m[16];
- u32 v[16];
- int i;
-
- WARN_ON(IS_ENABLED(DEBUG) &&
- (nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE));
-
- while (nblocks > 0) {
- blake2s_increment_counter(state, inc);
- memcpy(m, block, BLAKE2S_BLOCK_SIZE);
- le32_to_cpu_array(m, ARRAY_SIZE(m));
- memcpy(v, state->h, 32);
- v[ 8] = BLAKE2S_IV0;
- v[ 9] = BLAKE2S_IV1;
- v[10] = BLAKE2S_IV2;
- v[11] = BLAKE2S_IV3;
- v[12] = BLAKE2S_IV4 ^ state->t[0];
- v[13] = BLAKE2S_IV5 ^ state->t[1];
- v[14] = BLAKE2S_IV6 ^ state->f[0];
- v[15] = BLAKE2S_IV7 ^ state->f[1];
-
-#define G(r, i, a, b, c, d) do { \
- a += b + m[blake2s_sigma[r][2 * i + 0]]; \
- d = ror32(d ^ a, 16); \
- c += d; \
- b = ror32(b ^ c, 12); \
- a += b + m[blake2s_sigma[r][2 * i + 1]]; \
- d = ror32(d ^ a, 8); \
- c += d; \
- b = ror32(b ^ c, 7); \
-} while (0)
-
-#define ROUND(r) do { \
- G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
- G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
- G(r, 2, v[2], v[ 6], v[10], v[14]); \
- G(r, 3, v[3], v[ 7], v[11], v[15]); \
- G(r, 4, v[0], v[ 5], v[10], v[15]); \
- G(r, 5, v[1], v[ 6], v[11], v[12]); \
- G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
- G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
-} while (0)
- ROUND(0);
- ROUND(1);
- ROUND(2);
- ROUND(3);
- ROUND(4);
- ROUND(5);
- ROUND(6);
- ROUND(7);
- ROUND(8);
- ROUND(9);
-
-#undef G
-#undef ROUND
-
- for (i = 0; i < 8; ++i)
- state->h[i] ^= v[i] ^ v[i + 8];
-
- block += BLAKE2S_BLOCK_SIZE;
- --nblocks;
- }
-}
-
-EXPORT_SYMBOL(blake2s_compress_generic);
diff --git a/lib/crypto/blake2s-selftest.c b/lib/crypto/blake2s-selftest.c
deleted file mode 100644
index d0634ed6a937..000000000000
--- a/lib/crypto/blake2s-selftest.c
+++ /dev/null
@@ -1,651 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <crypto/internal/blake2s.h>
-#include <linux/kernel.h>
-#include <linux/random.h>
-#include <linux/string.h>
-
-/*
- * blake2s_testvecs[] generated with the program below (using libb2-dev and
- * libssl-dev [OpenSSL])
- *
- * #include <blake2.h>
- * #include <stdint.h>
- * #include <stdio.h>
- *
- * #include <openssl/evp.h>
- *
- * #define BLAKE2S_TESTVEC_COUNT 256
- *
- * static void print_vec(const uint8_t vec[], int len)
- * {
- * int i;
- *
- * printf(" { ");
- * for (i = 0; i < len; i++) {
- * if (i && (i % 12) == 0)
- * printf("\n ");
- * printf("0x%02x, ", vec[i]);
- * }
- * printf("},\n");
- * }
- *
- * int main(void)
- * {
- * uint8_t key[BLAKE2S_KEYBYTES];
- * uint8_t buf[BLAKE2S_TESTVEC_COUNT];
- * uint8_t hash[BLAKE2S_OUTBYTES];
- * int i, j;
- *
- * key[0] = key[1] = 1;
- * for (i = 2; i < BLAKE2S_KEYBYTES; ++i)
- * key[i] = key[i - 2] + key[i - 1];
- *
- * for (i = 0; i < BLAKE2S_TESTVEC_COUNT; ++i)
- * buf[i] = (uint8_t)i;
- *
- * printf("static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {\n");
- *
- * for (i = 0; i < BLAKE2S_TESTVEC_COUNT; ++i) {
- * int outlen = 1 + i % BLAKE2S_OUTBYTES;
- * int keylen = (13 * i) % (BLAKE2S_KEYBYTES + 1);
- *
- * blake2s(hash, buf, key + BLAKE2S_KEYBYTES - keylen, outlen, i,
- * keylen);
- * print_vec(hash, outlen);
- * }
- * printf("};\n\n");
- *
- * return 0;
- *}
- */
-static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {
- { 0xa1, },
- { 0x7c, 0x89, },
- { 0x74, 0x0e, 0xd4, },
- { 0x47, 0x0c, 0x21, 0x15, },
- { 0x18, 0xd6, 0x9c, 0xa6, 0xc4, },
- { 0x13, 0x5d, 0x16, 0x63, 0x2e, 0xf9, },
- { 0x2c, 0xb5, 0x04, 0xb7, 0x99, 0xe2, 0x73, },
- { 0x9a, 0x0f, 0xd2, 0x39, 0xd6, 0x68, 0x1b, 0x92, },
- { 0xc8, 0xde, 0x7a, 0xea, 0x2f, 0xf4, 0xd2, 0xe3, 0x2b, },
- { 0x5b, 0xf9, 0x43, 0x52, 0x0c, 0x12, 0xba, 0xb5, 0x93, 0x9f, },
- { 0xc6, 0x2c, 0x4e, 0x80, 0xfc, 0x32, 0x5b, 0x33, 0xb8, 0xb8, 0x0a, },
- { 0xa7, 0x5c, 0xfd, 0x3a, 0xcc, 0xbf, 0x90, 0xca, 0xb7, 0x97, 0xde, 0xd8, },
- { 0x66, 0xca, 0x3c, 0xc4, 0x19, 0xef, 0x92, 0x66, 0x3f, 0x21, 0x8f, 0xda,
- 0xb7, },
- { 0xba, 0xe5, 0xbb, 0x30, 0x25, 0x94, 0x6d, 0xc3, 0x89, 0x09, 0xc4, 0x25,
- 0x52, 0x3e, },
- { 0xa2, 0xef, 0x0e, 0x52, 0x0b, 0x5f, 0xa2, 0x01, 0x6d, 0x0a, 0x25, 0xbc,
- 0x57, 0xe2, 0x27, },
- { 0x4f, 0xe0, 0xf9, 0x52, 0x12, 0xda, 0x84, 0xb7, 0xab, 0xae, 0xb0, 0xa6,
- 0x47, 0x2a, 0xc7, 0xf5, },
- { 0x56, 0xe7, 0xa8, 0x1c, 0x4c, 0xca, 0xed, 0x90, 0x31, 0xec, 0x87, 0x43,
- 0xe7, 0x72, 0x08, 0xec, 0xbe, },
- { 0x7e, 0xdf, 0x80, 0x1c, 0x93, 0x33, 0xfd, 0x53, 0x44, 0xba, 0xfd, 0x96,
- 0xe1, 0xbb, 0xb5, 0x65, 0xa5, 0x00, },
- { 0xec, 0x6b, 0xed, 0xf7, 0x7b, 0x62, 0x1d, 0x7d, 0xf4, 0x82, 0xf3, 0x1e,
- 0x18, 0xff, 0x2b, 0xc4, 0x06, 0x20, 0x2a, },
- { 0x74, 0x98, 0xd7, 0x68, 0x63, 0xed, 0x87, 0xe4, 0x5d, 0x8d, 0x9e, 0x1d,
- 0xfd, 0x2a, 0xbb, 0x86, 0xac, 0xe9, 0x2a, 0x89, },
- { 0x89, 0xc3, 0x88, 0xce, 0x2b, 0x33, 0x1e, 0x10, 0xd1, 0x37, 0x20, 0x86,
- 0x28, 0x43, 0x70, 0xd9, 0xfb, 0x96, 0xd9, 0xb5, 0xd3, },
- { 0xcb, 0x56, 0x74, 0x41, 0x8d, 0x80, 0x01, 0x9a, 0x6b, 0x38, 0xe1, 0x41,
- 0xad, 0x9c, 0x62, 0x74, 0xce, 0x35, 0xd5, 0x6c, 0x89, 0x6e, },
- { 0x79, 0xaf, 0x94, 0x59, 0x99, 0x26, 0xe1, 0xc9, 0x34, 0xfe, 0x7c, 0x22,
- 0xf7, 0x43, 0xd7, 0x65, 0xd4, 0x48, 0x18, 0xac, 0x3d, 0xfd, 0x93, },
- { 0x85, 0x0d, 0xff, 0xb8, 0x3e, 0x87, 0x41, 0xb0, 0x95, 0xd3, 0x3d, 0x00,
- 0x47, 0x55, 0x9e, 0xd2, 0x69, 0xea, 0xbf, 0xe9, 0x7a, 0x2d, 0x61, 0x45, },
- { 0x03, 0xe0, 0x85, 0xec, 0x54, 0xb5, 0x16, 0x53, 0xa8, 0xc4, 0x71, 0xe9,
- 0x6a, 0xe7, 0xcb, 0xc4, 0x15, 0x02, 0xfc, 0x34, 0xa4, 0xa4, 0x28, 0x13,
- 0xd1, },
- { 0xe3, 0x34, 0x4b, 0xe1, 0xd0, 0x4b, 0x55, 0x61, 0x8f, 0xc0, 0x24, 0x05,
- 0xe6, 0xe0, 0x3d, 0x70, 0x24, 0x4d, 0xda, 0xb8, 0x91, 0x05, 0x29, 0x07,
- 0x01, 0x3e, },
- { 0x61, 0xff, 0x01, 0x72, 0xb1, 0x4d, 0xf6, 0xfe, 0xd1, 0xd1, 0x08, 0x74,
- 0xe6, 0x91, 0x44, 0xeb, 0x61, 0xda, 0x40, 0xaf, 0xfc, 0x8c, 0x91, 0x6b,
- 0xec, 0x13, 0xed, },
- { 0xd4, 0x40, 0xd2, 0xa0, 0x7f, 0xc1, 0x58, 0x0c, 0x85, 0xa0, 0x86, 0xc7,
- 0x86, 0xb9, 0x61, 0xc9, 0xea, 0x19, 0x86, 0x1f, 0xab, 0x07, 0xce, 0x37,
- 0x72, 0x67, 0x09, 0xfc, },
- { 0x9e, 0xf8, 0x18, 0x67, 0x93, 0x10, 0x9b, 0x39, 0x75, 0xe8, 0x8b, 0x38,
- 0x82, 0x7d, 0xb8, 0xb7, 0xa5, 0xaf, 0xe6, 0x6a, 0x22, 0x5e, 0x1f, 0x9c,
- 0x95, 0x29, 0x19, 0xf2, 0x4b, },
- { 0xc8, 0x62, 0x25, 0xf5, 0x98, 0xc9, 0xea, 0xe5, 0x29, 0x3a, 0xd3, 0x22,
- 0xeb, 0xeb, 0x07, 0x7c, 0x15, 0x07, 0xee, 0x15, 0x61, 0xbb, 0x05, 0x30,
- 0x99, 0x7f, 0x11, 0xf6, 0x0a, 0x1d, },
- { 0x68, 0x70, 0xf7, 0x90, 0xa1, 0x8b, 0x1f, 0x0f, 0xbb, 0xce, 0xd2, 0x0e,
- 0x33, 0x1f, 0x7f, 0xa9, 0x78, 0xa8, 0xa6, 0x81, 0x66, 0xab, 0x8d, 0xcd,
- 0x58, 0x55, 0x3a, 0x0b, 0x7a, 0xdb, 0xb5, },
- { 0xdd, 0x35, 0xd2, 0xb4, 0xf6, 0xc7, 0xea, 0xab, 0x64, 0x24, 0x4e, 0xfe,
- 0xe5, 0x3d, 0x4e, 0x95, 0x8b, 0x6d, 0x6c, 0xbc, 0xb0, 0xf8, 0x88, 0x61,
- 0x09, 0xb7, 0x78, 0xa3, 0x31, 0xfe, 0xd9, 0x2f, },
- { 0x0a, },
- { 0x6e, 0xd4, },
- { 0x64, 0xe9, 0xd1, },
- { 0x30, 0xdd, 0x71, 0xef, },
- { 0x11, 0xb5, 0x0c, 0x87, 0xc9, },
- { 0x06, 0x1c, 0x6d, 0x04, 0x82, 0xd0, },
- { 0x5c, 0x42, 0x0b, 0xee, 0xc5, 0x9c, 0xb2, },
- { 0xe8, 0x29, 0xd6, 0xb4, 0x5d, 0xf7, 0x2b, 0x93, },
- { 0x18, 0xca, 0x27, 0x72, 0x43, 0x39, 0x16, 0xbc, 0x6a, },
- { 0x39, 0x8f, 0xfd, 0x64, 0xf5, 0x57, 0x23, 0xb0, 0x45, 0xf8, },
- { 0xbb, 0x3a, 0x78, 0x6b, 0x02, 0x1d, 0x0b, 0x16, 0xe3, 0xb2, 0x9a, },
- { 0xb8, 0xb4, 0x0b, 0xe5, 0xd4, 0x1d, 0x0d, 0x85, 0x49, 0x91, 0x35, 0xfa, },
- { 0x6d, 0x48, 0x2a, 0x0c, 0x42, 0x08, 0xbd, 0xa9, 0x78, 0x6f, 0x18, 0xaf,
- 0xe2, },
- { 0x10, 0x45, 0xd4, 0x58, 0x88, 0xec, 0x4e, 0x1e, 0xf6, 0x14, 0x92, 0x64,
- 0x7e, 0xb0, },
- { 0x8b, 0x0b, 0x95, 0xee, 0x92, 0xc6, 0x3b, 0x91, 0xf1, 0x1e, 0xeb, 0x51,
- 0x98, 0x0a, 0x8d, },
- { 0xa3, 0x50, 0x4d, 0xa5, 0x1d, 0x03, 0x68, 0xe9, 0x57, 0x78, 0xd6, 0x04,
- 0xf1, 0xc3, 0x94, 0xd8, },
- { 0xb8, 0x66, 0x6e, 0xdd, 0x46, 0x15, 0xae, 0x3d, 0x83, 0x7e, 0xcf, 0xe7,
- 0x2c, 0xe8, 0x8f, 0xc7, 0x34, },
- { 0x2e, 0xc0, 0x1f, 0x29, 0xea, 0xf6, 0xb9, 0xe2, 0xc2, 0x93, 0xeb, 0x41,
- 0x0d, 0xf0, 0x0a, 0x13, 0x0e, 0xa2, },
- { 0x71, 0xb8, 0x33, 0xa9, 0x1b, 0xac, 0xf1, 0xb5, 0x42, 0x8f, 0x5e, 0x81,
- 0x34, 0x43, 0xb7, 0xa4, 0x18, 0x5c, 0x47, },
- { 0xda, 0x45, 0xb8, 0x2e, 0x82, 0x1e, 0xc0, 0x59, 0x77, 0x9d, 0xfa, 0xb4,
- 0x1c, 0x5e, 0xa0, 0x2b, 0x33, 0x96, 0x5a, 0x58, },
- { 0xe3, 0x09, 0x05, 0xa9, 0xeb, 0x48, 0x13, 0xad, 0x71, 0x88, 0x81, 0x9a,
- 0x3e, 0x2c, 0xe1, 0x23, 0x99, 0x13, 0x35, 0x9f, 0xb5, },
- { 0xb7, 0x86, 0x2d, 0x16, 0xe1, 0x04, 0x00, 0x47, 0x47, 0x61, 0x31, 0xfb,
- 0x14, 0xac, 0xd8, 0xe9, 0xe3, 0x49, 0xbd, 0xf7, 0x9c, 0x3f, },
- { 0x7f, 0xd9, 0x95, 0xa8, 0xa7, 0xa0, 0xcc, 0xba, 0xef, 0xb1, 0x0a, 0xa9,
- 0x21, 0x62, 0x08, 0x0f, 0x1b, 0xff, 0x7b, 0x9d, 0xae, 0xb2, 0x95, },
- { 0x85, 0x99, 0xea, 0x33, 0xe0, 0x56, 0xff, 0x13, 0xc6, 0x61, 0x8c, 0xf9,
- 0x57, 0x05, 0x03, 0x11, 0xf9, 0xfb, 0x3a, 0xf7, 0xce, 0xbb, 0x52, 0x30, },
- { 0xb2, 0x72, 0x9c, 0xf8, 0x77, 0x4e, 0x8f, 0x6b, 0x01, 0x6c, 0xff, 0x4e,
- 0x4f, 0x02, 0xd2, 0xbc, 0xeb, 0x51, 0x28, 0x99, 0x50, 0xab, 0xc4, 0x42,
- 0xe3, },
- { 0x8b, 0x0a, 0xb5, 0x90, 0x8f, 0xf5, 0x7b, 0xdd, 0xba, 0x47, 0x37, 0xc9,
- 0x2a, 0xd5, 0x4b, 0x25, 0x08, 0x8b, 0x02, 0x17, 0xa7, 0x9e, 0x6b, 0x6e,
- 0xe3, 0x90, },
- { 0x90, 0xdd, 0xf7, 0x75, 0xa7, 0xa3, 0x99, 0x5e, 0x5b, 0x7d, 0x75, 0xc3,
- 0x39, 0x6b, 0xa0, 0xe2, 0x44, 0x53, 0xb1, 0x9e, 0xc8, 0xf1, 0x77, 0x10,
- 0x58, 0x06, 0x9a, },
- { 0x99, 0x52, 0xf0, 0x49, 0xa8, 0x8c, 0xec, 0xa6, 0x97, 0x32, 0x13, 0xb5,
- 0xf7, 0xa3, 0x8e, 0xfb, 0x4b, 0x59, 0x31, 0x3d, 0x01, 0x59, 0x98, 0x5d,
- 0x53, 0x03, 0x1a, 0x39, },
- { 0x9f, 0xe0, 0xc2, 0xe5, 0x5d, 0x93, 0xd6, 0x9b, 0x47, 0x8f, 0x9b, 0xe0,
- 0x26, 0x35, 0x84, 0x20, 0x1d, 0xc5, 0x53, 0x10, 0x0f, 0x22, 0xb9, 0xb5,
- 0xd4, 0x36, 0xb1, 0xac, 0x73, },
- { 0x30, 0x32, 0x20, 0x3b, 0x10, 0x28, 0xec, 0x1f, 0x4f, 0x9b, 0x47, 0x59,
- 0xeb, 0x7b, 0xee, 0x45, 0xfb, 0x0c, 0x49, 0xd8, 0x3d, 0x69, 0xbd, 0x90,
- 0x2c, 0xf0, 0x9e, 0x8d, 0xbf, 0xd5, },
- { 0x2a, 0x37, 0x73, 0x7f, 0xf9, 0x96, 0x19, 0xaa, 0x25, 0xd8, 0x13, 0x28,
- 0x01, 0x29, 0x89, 0xdf, 0x6e, 0x0c, 0x9b, 0x43, 0x44, 0x51, 0xe9, 0x75,
- 0x26, 0x0c, 0xb7, 0x87, 0x66, 0x0b, 0x5f, },
- { 0x23, 0xdf, 0x96, 0x68, 0x91, 0x86, 0xd0, 0x93, 0x55, 0x33, 0x24, 0xf6,
- 0xba, 0x08, 0x75, 0x5b, 0x59, 0x11, 0x69, 0xb8, 0xb9, 0xe5, 0x2c, 0x77,
- 0x02, 0xf6, 0x47, 0xee, 0x81, 0xdd, 0xb9, 0x06, },
- { 0x9d, },
- { 0x9d, 0x7d, },
- { 0xfd, 0xc3, 0xda, },
- { 0xe8, 0x82, 0xcd, 0x21, },
- { 0xc3, 0x1d, 0x42, 0x4c, 0x74, },
- { 0xe9, 0xda, 0xf1, 0xa2, 0xe5, 0x7c, },
- { 0x52, 0xb8, 0x6f, 0x81, 0x5c, 0x3a, 0x4c, },
- { 0x5b, 0x39, 0x26, 0xfc, 0x92, 0x5e, 0xe0, 0x49, },
- { 0x59, 0xe4, 0x7c, 0x93, 0x1c, 0xf9, 0x28, 0x93, 0xde, },
- { 0xde, 0xdf, 0xb2, 0x43, 0x61, 0x0b, 0x86, 0x16, 0x4c, 0x2e, },
- { 0x14, 0x8f, 0x75, 0x51, 0xaf, 0xb9, 0xee, 0x51, 0x5a, 0xae, 0x23, },
- { 0x43, 0x5f, 0x50, 0xd5, 0x70, 0xb0, 0x5b, 0x87, 0xf5, 0xd9, 0xb3, 0x6d, },
- { 0x66, 0x0a, 0x64, 0x93, 0x79, 0x71, 0x94, 0x40, 0xb7, 0x68, 0x2d, 0xd3,
- 0x63, },
- { 0x15, 0x00, 0xc4, 0x0c, 0x7d, 0x1b, 0x10, 0xa9, 0x73, 0x1b, 0x90, 0x6f,
- 0xe6, 0xa9, },
- { 0x34, 0x75, 0xf3, 0x86, 0x8f, 0x56, 0xcf, 0x2a, 0x0a, 0xf2, 0x62, 0x0a,
- 0xf6, 0x0e, 0x20, },
- { 0xb1, 0xde, 0xc9, 0xf5, 0xdb, 0xf3, 0x2f, 0x4c, 0xd6, 0x41, 0x7d, 0x39,
- 0x18, 0x3e, 0xc7, 0xc3, },
- { 0xc5, 0x89, 0xb2, 0xf8, 0xb8, 0xc0, 0xa3, 0xb9, 0x3b, 0x10, 0x6d, 0x7c,
- 0x92, 0xfc, 0x7f, 0x34, 0x41, },
- { 0xc4, 0xd8, 0xef, 0xba, 0xef, 0xd2, 0xaa, 0xc5, 0x6c, 0x8e, 0x3e, 0xbb,
- 0x12, 0xfc, 0x0f, 0x72, 0xbf, 0x0f, },
- { 0xdd, 0x91, 0xd1, 0x15, 0x9e, 0x7d, 0xf8, 0xc1, 0xb9, 0x14, 0x63, 0x96,
- 0xb5, 0xcb, 0x83, 0x1d, 0x35, 0x1c, 0xec, },
- { 0xa9, 0xf8, 0x52, 0xc9, 0x67, 0x76, 0x2b, 0xad, 0xfb, 0xd8, 0x3a, 0xa6,
- 0x74, 0x02, 0xae, 0xb8, 0x25, 0x2c, 0x63, 0x49, },
- { 0x77, 0x1f, 0x66, 0x70, 0xfd, 0x50, 0x29, 0xaa, 0xeb, 0xdc, 0xee, 0xba,
- 0x75, 0x98, 0xdc, 0x93, 0x12, 0x3f, 0xdc, 0x7c, 0x38, },
- { 0xe2, 0xe1, 0x89, 0x5c, 0x37, 0x38, 0x6a, 0xa3, 0x40, 0xac, 0x3f, 0xb0,
- 0xca, 0xfc, 0xa7, 0xf3, 0xea, 0xf9, 0x0f, 0x5d, 0x8e, 0x39, },
- { 0x0f, 0x67, 0xc8, 0x38, 0x01, 0xb1, 0xb7, 0xb8, 0xa2, 0xe7, 0x0a, 0x6d,
- 0xd2, 0x63, 0x69, 0x9e, 0xcc, 0xf0, 0xf2, 0xbe, 0x9b, 0x98, 0xdd, },
- { 0x13, 0xe1, 0x36, 0x30, 0xfe, 0xc6, 0x01, 0x8a, 0xa1, 0x63, 0x96, 0x59,
- 0xc2, 0xa9, 0x68, 0x3f, 0x58, 0xd4, 0x19, 0x0c, 0x40, 0xf3, 0xde, 0x02, },
- { 0xa3, 0x9e, 0xce, 0xda, 0x42, 0xee, 0x8c, 0x6c, 0x5a, 0x7d, 0xdc, 0x89,
- 0x02, 0x77, 0xdd, 0xe7, 0x95, 0xbb, 0xff, 0x0d, 0xa4, 0xb5, 0x38, 0x1e,
- 0xaf, },
- { 0x9a, 0xf6, 0xb5, 0x9a, 0x4f, 0xa9, 0x4f, 0x2c, 0x35, 0x3c, 0x24, 0xdc,
- 0x97, 0x6f, 0xd9, 0xa1, 0x7d, 0x1a, 0x85, 0x0b, 0xf5, 0xda, 0x2e, 0xe7,
- 0xb1, 0x1d, },
- { 0x84, 0x1e, 0x8e, 0x3d, 0x45, 0xa5, 0xf2, 0x27, 0xf3, 0x31, 0xfe, 0xb9,
- 0xfb, 0xc5, 0x45, 0x99, 0x99, 0xdd, 0x93, 0x43, 0x02, 0xee, 0x58, 0xaf,
- 0xee, 0x6a, 0xbe, },
- { 0x07, 0x2f, 0xc0, 0xa2, 0x04, 0xc4, 0xab, 0x7c, 0x26, 0xbb, 0xa8, 0xd8,
- 0xe3, 0x1c, 0x75, 0x15, 0x64, 0x5d, 0x02, 0x6a, 0xf0, 0x86, 0xe9, 0xcd,
- 0x5c, 0xef, 0xa3, 0x25, },
- { 0x2f, 0x3b, 0x1f, 0xb5, 0x91, 0x8f, 0x86, 0xe0, 0xdc, 0x31, 0x48, 0xb6,
- 0xa1, 0x8c, 0xfd, 0x75, 0xbb, 0x7d, 0x3d, 0xc1, 0xf0, 0x10, 0x9a, 0xd8,
- 0x4b, 0x0e, 0xe3, 0x94, 0x9f, },
- { 0x29, 0xbb, 0x8f, 0x6c, 0xd1, 0xf2, 0xb6, 0xaf, 0xe5, 0xe3, 0x2d, 0xdc,
- 0x6f, 0xa4, 0x53, 0x88, 0xd8, 0xcf, 0x4d, 0x45, 0x42, 0x62, 0xdb, 0xdf,
- 0xf8, 0x45, 0xc2, 0x13, 0xec, 0x35, },
- { 0x06, 0x3c, 0xe3, 0x2c, 0x15, 0xc6, 0x43, 0x03, 0x81, 0xfb, 0x08, 0x76,
- 0x33, 0xcb, 0x02, 0xc1, 0xba, 0x33, 0xe5, 0xe0, 0xd1, 0x92, 0xa8, 0x46,
- 0x28, 0x3f, 0x3e, 0x9d, 0x2c, 0x44, 0x54, },
- { 0xea, 0xbb, 0x96, 0xf8, 0xd1, 0x8b, 0x04, 0x11, 0x40, 0x78, 0x42, 0x02,
- 0x19, 0xd1, 0xbc, 0x65, 0x92, 0xd3, 0xc3, 0xd6, 0xd9, 0x19, 0xe7, 0xc3,
- 0x40, 0x97, 0xbd, 0xd4, 0xed, 0xfa, 0x5e, 0x28, },
- { 0x02, },
- { 0x52, 0xa8, },
- { 0x38, 0x25, 0x0d, },
- { 0xe3, 0x04, 0xd4, 0x92, },
- { 0x97, 0xdb, 0xf7, 0x81, 0xca, },
- { 0x8a, 0x56, 0x9d, 0x62, 0x56, 0xcc, },
- { 0xa1, 0x8e, 0x3c, 0x72, 0x8f, 0x63, 0x03, },
- { 0xf7, 0xf3, 0x39, 0x09, 0x0a, 0xa1, 0xbb, 0x23, },
- { 0x6b, 0x03, 0xc0, 0xe9, 0xd9, 0x83, 0x05, 0x22, 0x01, },
- { 0x1b, 0x4b, 0xf5, 0xd6, 0x4f, 0x05, 0x75, 0x91, 0x4c, 0x7f, },
- { 0x4c, 0x8c, 0x25, 0x20, 0x21, 0xcb, 0xc2, 0x4b, 0x3a, 0x5b, 0x8d, },
- { 0x56, 0xe2, 0x77, 0xa0, 0xb6, 0x9f, 0x81, 0xec, 0x83, 0x75, 0xc4, 0xf9, },
- { 0x71, 0x70, 0x0f, 0xad, 0x4d, 0x35, 0x81, 0x9d, 0x88, 0x69, 0xf9, 0xaa,
- 0xd3, },
- { 0x50, 0x6e, 0x86, 0x6e, 0x43, 0xc0, 0xc2, 0x44, 0xc2, 0xe2, 0xa0, 0x1c,
- 0xb7, 0x9a, },
- { 0xe4, 0x7e, 0x72, 0xc6, 0x12, 0x8e, 0x7c, 0xfc, 0xbd, 0xe2, 0x08, 0x31,
- 0x3d, 0x47, 0x3d, },
- { 0x08, 0x97, 0x5b, 0x80, 0xae, 0xc4, 0x1d, 0x50, 0x77, 0xdf, 0x1f, 0xd0,
- 0x24, 0xf0, 0x17, 0xc0, },
- { 0x01, 0xb6, 0x29, 0xf4, 0xaf, 0x78, 0x5f, 0xb6, 0x91, 0xdd, 0x76, 0x76,
- 0xd2, 0xfd, 0x0c, 0x47, 0x40, },
- { 0xa1, 0xd8, 0x09, 0x97, 0x7a, 0xa6, 0xc8, 0x94, 0xf6, 0x91, 0x7b, 0xae,
- 0x2b, 0x9f, 0x0d, 0x83, 0x48, 0xf7, },
- { 0x12, 0xd5, 0x53, 0x7d, 0x9a, 0xb0, 0xbe, 0xd9, 0xed, 0xe9, 0x9e, 0xee,
- 0x61, 0x5b, 0x42, 0xf2, 0xc0, 0x73, 0xc0, },
- { 0xd5, 0x77, 0xd6, 0x5c, 0x6e, 0xa5, 0x69, 0x2b, 0x3b, 0x8c, 0xd6, 0x7d,
- 0x1d, 0xbe, 0x2c, 0xa1, 0x02, 0x21, 0xcd, 0x29, },
- { 0xa4, 0x98, 0x80, 0xca, 0x22, 0xcf, 0x6a, 0xab, 0x5e, 0x40, 0x0d, 0x61,
- 0x08, 0x21, 0xef, 0xc0, 0x6c, 0x52, 0xb4, 0xb0, 0x53, },
- { 0xbf, 0xaf, 0x8f, 0x3b, 0x7a, 0x97, 0x33, 0xe5, 0xca, 0x07, 0x37, 0xfd,
- 0x15, 0xdf, 0xce, 0x26, 0x2a, 0xb1, 0xa7, 0x0b, 0xb3, 0xac, },
- { 0x16, 0x22, 0xe1, 0xbc, 0x99, 0x4e, 0x01, 0xf0, 0xfa, 0xff, 0x8f, 0xa5,
- 0x0c, 0x61, 0xb0, 0xad, 0xcc, 0xb1, 0xe1, 0x21, 0x46, 0xfa, 0x2e, },
- { 0x11, 0x5b, 0x0b, 0x2b, 0xe6, 0x14, 0xc1, 0xd5, 0x4d, 0x71, 0x5e, 0x17,
- 0xea, 0x23, 0xdd, 0x6c, 0xbd, 0x1d, 0xbe, 0x12, 0x1b, 0xee, 0x4c, 0x1a, },
- { 0x40, 0x88, 0x22, 0xf3, 0x20, 0x6c, 0xed, 0xe1, 0x36, 0x34, 0x62, 0x2c,
- 0x98, 0x83, 0x52, 0xe2, 0x25, 0xee, 0xe9, 0xf5, 0xe1, 0x17, 0xf0, 0x5c,
- 0xae, },
- { 0xc3, 0x76, 0x37, 0xde, 0x95, 0x8c, 0xca, 0x2b, 0x0c, 0x23, 0xe7, 0xb5,
- 0x38, 0x70, 0x61, 0xcc, 0xff, 0xd3, 0x95, 0x7b, 0xf3, 0xff, 0x1f, 0x9d,
- 0x59, 0x00, },
- { 0x0c, 0x19, 0x52, 0x05, 0x22, 0x53, 0xcb, 0x48, 0xd7, 0x10, 0x0e, 0x7e,
- 0x14, 0x69, 0xb5, 0xa2, 0x92, 0x43, 0xa3, 0x9e, 0x4b, 0x8f, 0x51, 0x2c,
- 0x5a, 0x2c, 0x3b, },
- { 0xe1, 0x9d, 0x70, 0x70, 0x28, 0xec, 0x86, 0x40, 0x55, 0x33, 0x56, 0xda,
- 0x88, 0xca, 0xee, 0xc8, 0x6a, 0x20, 0xb1, 0xe5, 0x3d, 0x57, 0xf8, 0x3c,
- 0x10, 0x07, 0x2a, 0xc4, },
- { 0x0b, 0xae, 0xf1, 0xc4, 0x79, 0xee, 0x1b, 0x3d, 0x27, 0x35, 0x8d, 0x14,
- 0xd6, 0xae, 0x4e, 0x3c, 0xe9, 0x53, 0x50, 0xb5, 0xcc, 0x0c, 0xf7, 0xdf,
- 0xee, 0xa1, 0x74, 0xd6, 0x71, },
- { 0xe6, 0xa4, 0xf4, 0x99, 0x98, 0xb9, 0x80, 0xea, 0x96, 0x7f, 0x4f, 0x33,
- 0xcf, 0x74, 0x25, 0x6f, 0x17, 0x6c, 0xbf, 0xf5, 0x5c, 0x38, 0xd0, 0xff,
- 0x96, 0xcb, 0x13, 0xf9, 0xdf, 0xfd, },
- { 0xbe, 0x92, 0xeb, 0xba, 0x44, 0x2c, 0x24, 0x74, 0xd4, 0x03, 0x27, 0x3c,
- 0x5d, 0x5b, 0x03, 0x30, 0x87, 0x63, 0x69, 0xe0, 0xb8, 0x94, 0xf4, 0x44,
- 0x7e, 0xad, 0xcd, 0x20, 0x12, 0x16, 0x79, },
- { 0x30, 0xf1, 0xc4, 0x8e, 0x05, 0x90, 0x2a, 0x97, 0x63, 0x94, 0x46, 0xff,
- 0xce, 0xd8, 0x67, 0xa7, 0xac, 0x33, 0x8c, 0x95, 0xb7, 0xcd, 0xa3, 0x23,
- 0x98, 0x9d, 0x76, 0x6c, 0x9d, 0xa8, 0xd6, 0x8a, },
- { 0xbe, },
- { 0x17, 0x6c, },
- { 0x1a, 0x42, 0x4f, },
- { 0xba, 0xaf, 0xb7, 0x65, },
- { 0xc2, 0x63, 0x43, 0x6a, 0xea, },
- { 0xe4, 0x4d, 0xad, 0xf2, 0x0b, 0x02, },
- { 0x04, 0xc7, 0xc4, 0x7f, 0xa9, 0x2b, 0xce, },
- { 0x66, 0xf6, 0x67, 0xcb, 0x03, 0x53, 0xc8, 0xf1, },
- { 0x56, 0xa3, 0x60, 0x78, 0xc9, 0x5f, 0x70, 0x1b, 0x5e, },
- { 0x99, 0xff, 0x81, 0x7c, 0x13, 0x3c, 0x29, 0x79, 0x4b, 0x65, },
- { 0x51, 0x10, 0x50, 0x93, 0x01, 0x93, 0xb7, 0x01, 0xc9, 0x18, 0xb7, },
- { 0x8e, 0x3c, 0x42, 0x1e, 0x5e, 0x7d, 0xc1, 0x50, 0x70, 0x1f, 0x00, 0x98, },
- { 0x5f, 0xd9, 0x9b, 0xc8, 0xd7, 0xb2, 0x72, 0x62, 0x1a, 0x1e, 0xba, 0x92,
- 0xe9, },
- { 0x70, 0x2b, 0xba, 0xfe, 0xad, 0x5d, 0x96, 0x3f, 0x27, 0xc2, 0x41, 0x6d,
- 0xc4, 0xb3, },
- { 0xae, 0xe0, 0xd5, 0xd4, 0xc7, 0xae, 0x15, 0x5e, 0xdc, 0xdd, 0x33, 0x60,
- 0xd7, 0xd3, 0x5e, },
- { 0x79, 0x8e, 0xbc, 0x9e, 0x20, 0xb9, 0x19, 0x4b, 0x63, 0x80, 0xf3, 0x16,
- 0xaf, 0x39, 0xbd, 0x92, },
- { 0xc2, 0x0e, 0x85, 0xa0, 0x0b, 0x9a, 0xb0, 0xec, 0xde, 0x38, 0xd3, 0x10,
- 0xd9, 0xa7, 0x66, 0x27, 0xcf, },
- { 0x0e, 0x3b, 0x75, 0x80, 0x67, 0x14, 0x0c, 0x02, 0x90, 0xd6, 0xb3, 0x02,
- 0x81, 0xf6, 0xa6, 0x87, 0xce, 0x58, },
- { 0x79, 0xb5, 0xe9, 0x5d, 0x52, 0x4d, 0xf7, 0x59, 0xf4, 0x2e, 0x27, 0xdd,
- 0xb3, 0xed, 0x57, 0x5b, 0x82, 0xea, 0x6f, },
- { 0xa2, 0x97, 0xf5, 0x80, 0x02, 0x3d, 0xde, 0xa3, 0xf9, 0xf6, 0xab, 0xe3,
- 0x57, 0x63, 0x7b, 0x9b, 0x10, 0x42, 0x6f, 0xf2, },
- { 0x12, 0x7a, 0xfc, 0xb7, 0x67, 0x06, 0x0c, 0x78, 0x1a, 0xfe, 0x88, 0x4f,
- 0xc6, 0xac, 0x52, 0x96, 0x64, 0x28, 0x97, 0x84, 0x06, },
- { 0xc5, 0x04, 0x44, 0x6b, 0xb2, 0xa5, 0xa4, 0x66, 0xe1, 0x76, 0xa2, 0x51,
- 0xf9, 0x59, 0x69, 0x97, 0x56, 0x0b, 0xbf, 0x50, 0xb3, 0x34, },
- { 0x21, 0x32, 0x6b, 0x42, 0xb5, 0xed, 0x71, 0x8d, 0xf7, 0x5a, 0x35, 0xe3,
- 0x90, 0xe2, 0xee, 0xaa, 0x89, 0xf6, 0xc9, 0x9c, 0x4d, 0x73, 0xf4, },
- { 0x4c, 0xa6, 0x09, 0xf4, 0x48, 0xe7, 0x46, 0xbc, 0x49, 0xfc, 0xe5, 0xda,
- 0xd1, 0x87, 0x13, 0x17, 0x4c, 0x59, 0x71, 0x26, 0x5b, 0x2c, 0x42, 0xb7, },
- { 0x13, 0x63, 0xf3, 0x40, 0x02, 0xe5, 0xa3, 0x3a, 0x5e, 0x8e, 0xf8, 0xb6,
- 0x8a, 0x49, 0x60, 0x76, 0x34, 0x72, 0x94, 0x73, 0xf6, 0xd9, 0x21, 0x6a,
- 0x26, },
- { 0xdf, 0x75, 0x16, 0x10, 0x1b, 0x5e, 0x81, 0xc3, 0xc8, 0xde, 0x34, 0x24,
- 0xb0, 0x98, 0xeb, 0x1b, 0x8f, 0xa1, 0x9b, 0x05, 0xee, 0xa5, 0xe9, 0x35,
- 0xf4, 0x1d, },
- { 0xcd, 0x21, 0x93, 0x6e, 0x5b, 0xa0, 0x26, 0x2b, 0x21, 0x0e, 0xa0, 0xb9,
- 0x1c, 0xb5, 0xbb, 0xb8, 0xf8, 0x1e, 0xff, 0x5c, 0xa8, 0xf9, 0x39, 0x46,
- 0x4e, 0x29, 0x26, },
- { 0x73, 0x7f, 0x0e, 0x3b, 0x0b, 0x5c, 0xf9, 0x60, 0xaa, 0x88, 0xa1, 0x09,
- 0xb1, 0x5d, 0x38, 0x7b, 0x86, 0x8f, 0x13, 0x7a, 0x8d, 0x72, 0x7a, 0x98,
- 0x1a, 0x5b, 0xff, 0xc9, },
- { 0xd3, 0x3c, 0x61, 0x71, 0x44, 0x7e, 0x31, 0x74, 0x98, 0x9d, 0x9a, 0xd2,
- 0x27, 0xf3, 0x46, 0x43, 0x42, 0x51, 0xd0, 0x5f, 0xe9, 0x1c, 0x5c, 0x69,
- 0xbf, 0xf6, 0xbe, 0x3c, 0x40, },
- { 0x31, 0x99, 0x31, 0x9f, 0xaa, 0x43, 0x2e, 0x77, 0x3e, 0x74, 0x26, 0x31,
- 0x5e, 0x61, 0xf1, 0x87, 0xe2, 0xeb, 0x9b, 0xcd, 0xd0, 0x3a, 0xee, 0x20,
- 0x7e, 0x10, 0x0a, 0x0b, 0x7e, 0xfa, },
- { 0xa4, 0x27, 0x80, 0x67, 0x81, 0x2a, 0xa7, 0x62, 0xf7, 0x6e, 0xda, 0xd4,
- 0x5c, 0x39, 0x74, 0xad, 0x7e, 0xbe, 0xad, 0xa5, 0x84, 0x7f, 0xa9, 0x30,
- 0x5d, 0xdb, 0xe2, 0x05, 0x43, 0xf7, 0x1b, },
- { 0x0b, 0x37, 0xd8, 0x02, 0xe1, 0x83, 0xd6, 0x80, 0xf2, 0x35, 0xc2, 0xb0,
- 0x37, 0xef, 0xef, 0x5e, 0x43, 0x93, 0xf0, 0x49, 0x45, 0x0a, 0xef, 0xb5,
- 0x76, 0x70, 0x12, 0x44, 0xc4, 0xdb, 0xf5, 0x7a, },
- { 0x1f, },
- { 0x82, 0x60, },
- { 0xcc, 0xe3, 0x08, },
- { 0x56, 0x17, 0xe4, 0x59, },
- { 0xe2, 0xd7, 0x9e, 0xc4, 0x4c, },
- { 0xb2, 0xad, 0xd3, 0x78, 0x58, 0x5a, },
- { 0xce, 0x43, 0xb4, 0x02, 0x96, 0xab, 0x3c, },
- { 0xe6, 0x05, 0x1a, 0x73, 0x22, 0x32, 0xbb, 0x77, },
- { 0x23, 0xe7, 0xda, 0xfe, 0x2c, 0xef, 0x8c, 0x22, 0xec, },
- { 0xe9, 0x8e, 0x55, 0x38, 0xd1, 0xd7, 0x35, 0x23, 0x98, 0xc7, },
- { 0xb5, 0x81, 0x1a, 0xe5, 0xb5, 0xa5, 0xd9, 0x4d, 0xca, 0x41, 0xe7, },
- { 0x41, 0x16, 0x16, 0x95, 0x8d, 0x9e, 0x0c, 0xea, 0x8c, 0x71, 0x9a, 0xc1, },
- { 0x7c, 0x33, 0xc0, 0xa4, 0x00, 0x62, 0xea, 0x60, 0x67, 0xe4, 0x20, 0xbc,
- 0x5b, },
- { 0xdb, 0xb1, 0xdc, 0xfd, 0x08, 0xc0, 0xde, 0x82, 0xd1, 0xde, 0x38, 0xc0,
- 0x90, 0x48, },
- { 0x37, 0x18, 0x2e, 0x0d, 0x61, 0xaa, 0x61, 0xd7, 0x86, 0x20, 0x16, 0x60,
- 0x04, 0xd9, 0xd5, },
- { 0xb0, 0xcf, 0x2c, 0x4c, 0x5e, 0x5b, 0x4f, 0x2a, 0x23, 0x25, 0x58, 0x47,
- 0xe5, 0x31, 0x06, 0x70, },
- { 0x91, 0xa0, 0xa3, 0x86, 0x4e, 0xe0, 0x72, 0x38, 0x06, 0x67, 0x59, 0x5c,
- 0x70, 0x25, 0xdb, 0x33, 0x27, },
- { 0x44, 0x58, 0x66, 0xb8, 0x58, 0xc7, 0x13, 0xed, 0x4c, 0xc0, 0xf4, 0x9a,
- 0x1e, 0x67, 0x75, 0x33, 0xb6, 0xb8, },
- { 0x7f, 0x98, 0x4a, 0x8e, 0x50, 0xa2, 0x5c, 0xcd, 0x59, 0xde, 0x72, 0xb3,
- 0x9d, 0xc3, 0x09, 0x8a, 0xab, 0x56, 0xf1, },
- { 0x80, 0x96, 0x49, 0x1a, 0x59, 0xa2, 0xc5, 0xd5, 0xa7, 0x20, 0x8a, 0xb7,
- 0x27, 0x62, 0x84, 0x43, 0xc6, 0xe1, 0x1b, 0x5d, },
- { 0x6b, 0xb7, 0x2b, 0x26, 0x62, 0x14, 0x70, 0x19, 0x3d, 0x4d, 0xac, 0xac,
- 0x63, 0x58, 0x5e, 0x94, 0xb5, 0xb7, 0xe8, 0xe8, 0xa2, },
- { 0x20, 0xa8, 0xc0, 0xfd, 0x63, 0x3d, 0x6e, 0x98, 0xcf, 0x0c, 0x49, 0x98,
- 0xe4, 0x5a, 0xfe, 0x8c, 0xaa, 0x70, 0x82, 0x1c, 0x7b, 0x74, },
- { 0xc8, 0xe8, 0xdd, 0xdf, 0x69, 0x30, 0x01, 0xc2, 0x0f, 0x7e, 0x2f, 0x11,
- 0xcc, 0x3e, 0x17, 0xa5, 0x69, 0x40, 0x3f, 0x0e, 0x79, 0x7f, 0xcf, },
- { 0xdb, 0x61, 0xc0, 0xe2, 0x2e, 0x49, 0x07, 0x31, 0x1d, 0x91, 0x42, 0x8a,
- 0xfc, 0x5e, 0xd3, 0xf8, 0x56, 0x1f, 0x2b, 0x73, 0xfd, 0x9f, 0xb2, 0x8e, },
- { 0x0c, 0x89, 0x55, 0x0c, 0x1f, 0x59, 0x2c, 0x9d, 0x1b, 0x29, 0x1d, 0x41,
- 0x1d, 0xe6, 0x47, 0x8f, 0x8c, 0x2b, 0xea, 0x8f, 0xf0, 0xff, 0x21, 0x70,
- 0x88, },
- { 0x12, 0x18, 0x95, 0xa6, 0x59, 0xb1, 0x31, 0x24, 0x45, 0x67, 0x55, 0xa4,
- 0x1a, 0x2d, 0x48, 0x67, 0x1b, 0x43, 0x88, 0x2d, 0x8e, 0xa0, 0x70, 0xb3,
- 0xc6, 0xbb, },
- { 0xe7, 0xb1, 0x1d, 0xb2, 0x76, 0x4d, 0x68, 0x68, 0x68, 0x23, 0x02, 0x55,
- 0x3a, 0xe2, 0xe5, 0xd5, 0x4b, 0x43, 0xf9, 0x34, 0x77, 0x5c, 0xa1, 0xf5,
- 0x55, 0xfd, 0x4f, },
- { 0x8c, 0x87, 0x5a, 0x08, 0x3a, 0x73, 0xad, 0x61, 0xe1, 0xe7, 0x99, 0x7e,
- 0xf0, 0x5d, 0xe9, 0x5d, 0x16, 0x43, 0x80, 0x2f, 0xd0, 0x66, 0x34, 0xe2,
- 0x42, 0x64, 0x3b, 0x1a, },
- { 0x39, 0xc1, 0x99, 0xcf, 0x22, 0xbf, 0x16, 0x8f, 0x9f, 0x80, 0x7f, 0x95,
- 0x0a, 0x05, 0x67, 0x27, 0xe7, 0x15, 0xdf, 0x9d, 0xb2, 0xfe, 0x1c, 0xb5,
- 0x1d, 0x60, 0x8f, 0x8a, 0x1d, },
- { 0x9b, 0x6e, 0x08, 0x09, 0x06, 0x73, 0xab, 0x68, 0x02, 0x62, 0x1a, 0xe4,
- 0xd4, 0xdf, 0xc7, 0x02, 0x4c, 0x6a, 0x5f, 0xfd, 0x23, 0xac, 0xae, 0x6d,
- 0x43, 0xa4, 0x7a, 0x50, 0x60, 0x3c, },
- { 0x1d, 0xb4, 0xc6, 0xe1, 0xb1, 0x4b, 0xe3, 0xf2, 0xe2, 0x1a, 0x73, 0x1b,
- 0xa0, 0x92, 0xa7, 0xf5, 0xff, 0x8f, 0x8b, 0x5d, 0xdf, 0xa8, 0x04, 0xb3,
- 0xb0, 0xf7, 0xcc, 0x12, 0xfa, 0x35, 0x46, },
- { 0x49, 0x45, 0x97, 0x11, 0x0f, 0x1c, 0x60, 0x8e, 0xe8, 0x47, 0x30, 0xcf,
- 0x60, 0xa8, 0x71, 0xc5, 0x1b, 0xe9, 0x39, 0x4d, 0x49, 0xb6, 0x12, 0x1f,
- 0x24, 0xab, 0x37, 0xff, 0x83, 0xc2, 0xe1, 0x3a, },
- { 0x60, },
- { 0x24, 0x26, },
- { 0x47, 0xeb, 0xc9, },
- { 0x4a, 0xd0, 0xbc, 0xf0, },
- { 0x8e, 0x2b, 0xc9, 0x85, 0x3c, },
- { 0xa2, 0x07, 0x15, 0xb8, 0x12, 0x74, },
- { 0x0f, 0xdb, 0x5b, 0x33, 0x69, 0xfe, 0x4b, },
- { 0xa2, 0x86, 0x54, 0xf4, 0xfd, 0xb2, 0xd4, 0xe6, },
- { 0xbb, 0x84, 0x78, 0x49, 0x27, 0x8e, 0x61, 0xda, 0x60, },
- { 0x04, 0xc3, 0xcd, 0xaa, 0x8f, 0xa7, 0x03, 0xc9, 0xf9, 0xb6, },
- { 0xf8, 0x27, 0x1d, 0x61, 0xdc, 0x21, 0x42, 0xdd, 0xad, 0x92, 0x40, },
- { 0x12, 0x87, 0xdf, 0xc2, 0x41, 0x45, 0x5a, 0x36, 0x48, 0x5b, 0x51, 0x2b, },
- { 0xbb, 0x37, 0x5d, 0x1f, 0xf1, 0x68, 0x7a, 0xc4, 0xa5, 0xd2, 0xa4, 0x91,
- 0x8d, },
- { 0x5b, 0x27, 0xd1, 0x04, 0x54, 0x52, 0x9f, 0xa3, 0x47, 0x86, 0x33, 0x33,
- 0xbf, 0xa0, },
- { 0xcf, 0x04, 0xea, 0xf8, 0x03, 0x2a, 0x43, 0xff, 0xa6, 0x68, 0x21, 0x4c,
- 0xd5, 0x4b, 0xed, },
- { 0xaf, 0xb8, 0xbc, 0x63, 0x0f, 0x18, 0x4d, 0xe2, 0x7a, 0xdd, 0x46, 0x44,
- 0xc8, 0x24, 0x0a, 0xb7, },
- { 0x3e, 0xdc, 0x36, 0xe4, 0x89, 0xb1, 0xfa, 0xc6, 0x40, 0x93, 0x2e, 0x75,
- 0xb2, 0x15, 0xd1, 0xb1, 0x10, },
- { 0x6c, 0xd8, 0x20, 0x3b, 0x82, 0x79, 0xf9, 0xc8, 0xbc, 0x9d, 0xe0, 0x35,
- 0xbe, 0x1b, 0x49, 0x1a, 0xbc, 0x3a, },
- { 0x78, 0x65, 0x2c, 0xbe, 0x35, 0x67, 0xdc, 0x78, 0xd4, 0x41, 0xf6, 0xc9,
- 0xde, 0xde, 0x1f, 0x18, 0x13, 0x31, 0x11, },
- { 0x8a, 0x7f, 0xb1, 0x33, 0x8f, 0x0c, 0x3c, 0x0a, 0x06, 0x61, 0xf0, 0x47,
- 0x29, 0x1b, 0x29, 0xbc, 0x1c, 0x47, 0xef, 0x7a, },
- { 0x65, 0x91, 0xf1, 0xe6, 0xb3, 0x96, 0xd3, 0x8c, 0xc2, 0x4a, 0x59, 0x35,
- 0x72, 0x8e, 0x0b, 0x9a, 0x87, 0xca, 0x34, 0x7b, 0x63, },
- { 0x5f, 0x08, 0x87, 0x80, 0x56, 0x25, 0x89, 0x77, 0x61, 0x8c, 0x64, 0xa1,
- 0x59, 0x6d, 0x59, 0x62, 0xe8, 0x4a, 0xc8, 0x58, 0x99, 0xd1, },
- { 0x23, 0x87, 0x1d, 0xed, 0x6f, 0xf2, 0x91, 0x90, 0xe2, 0xfe, 0x43, 0x21,
- 0xaf, 0x97, 0xc6, 0xbc, 0xd7, 0x15, 0xc7, 0x2d, 0x08, 0x77, 0x91, },
- { 0x90, 0x47, 0x9a, 0x9e, 0x3a, 0xdf, 0xf3, 0xc9, 0x4c, 0x1e, 0xa7, 0xd4,
- 0x6a, 0x32, 0x90, 0xfe, 0xb7, 0xb6, 0x7b, 0xfa, 0x96, 0x61, 0xfb, 0xa4, },
- { 0xb1, 0x67, 0x60, 0x45, 0xb0, 0x96, 0xc5, 0x15, 0x9f, 0x4d, 0x26, 0xd7,
- 0x9d, 0xf1, 0xf5, 0x6d, 0x21, 0x00, 0x94, 0x31, 0x64, 0x94, 0xd3, 0xa7,
- 0xd3, },
- { 0x02, 0x3e, 0xaf, 0xf3, 0x79, 0x73, 0xa5, 0xf5, 0xcc, 0x7a, 0x7f, 0xfb,
- 0x79, 0x2b, 0x85, 0x8c, 0x88, 0x72, 0x06, 0xbe, 0xfe, 0xaf, 0xc1, 0x16,
- 0xa6, 0xd6, },
- { 0x2a, 0xb0, 0x1a, 0xe5, 0xaa, 0x6e, 0xb3, 0xae, 0x53, 0x85, 0x33, 0x80,
- 0x75, 0xae, 0x30, 0xe6, 0xb8, 0x72, 0x42, 0xf6, 0x25, 0x4f, 0x38, 0x88,
- 0x55, 0xd1, 0xa9, },
- { 0x90, 0xd8, 0x0c, 0xc0, 0x93, 0x4b, 0x4f, 0x9e, 0x65, 0x6c, 0xa1, 0x54,
- 0xa6, 0xf6, 0x6e, 0xca, 0xd2, 0xbb, 0x7e, 0x6a, 0x1c, 0xd3, 0xce, 0x46,
- 0xef, 0xb0, 0x00, 0x8d, },
- { 0xed, 0x9c, 0x49, 0xcd, 0xc2, 0xde, 0x38, 0x0e, 0xe9, 0x98, 0x6c, 0xc8,
- 0x90, 0x9e, 0x3c, 0xd4, 0xd3, 0xeb, 0x88, 0x32, 0xc7, 0x28, 0xe3, 0x94,
- 0x1c, 0x9f, 0x8b, 0xf3, 0xcb, },
- { 0xac, 0xe7, 0x92, 0x16, 0xb4, 0x14, 0xa0, 0xe4, 0x04, 0x79, 0xa2, 0xf4,
- 0x31, 0xe6, 0x0c, 0x26, 0xdc, 0xbf, 0x2f, 0x69, 0x1b, 0x55, 0x94, 0x67,
- 0xda, 0x0c, 0xd7, 0x32, 0x1f, 0xef, },
- { 0x68, 0x63, 0x85, 0x57, 0x95, 0x9e, 0x42, 0x27, 0x41, 0x43, 0x42, 0x02,
- 0xa5, 0x78, 0xa7, 0xc6, 0x43, 0xc1, 0x6a, 0xba, 0x70, 0x80, 0xcd, 0x04,
- 0xb6, 0x78, 0x76, 0x29, 0xf3, 0xe8, 0xa0, },
- { 0xe6, 0xac, 0x8d, 0x9d, 0xf0, 0xc0, 0xf7, 0xf7, 0xe3, 0x3e, 0x4e, 0x28,
- 0x0f, 0x59, 0xb2, 0x67, 0x9e, 0x84, 0x34, 0x42, 0x96, 0x30, 0x2b, 0xca,
- 0x49, 0xb6, 0xc5, 0x9a, 0x84, 0x59, 0xa7, 0x81, },
- { 0x7e, },
- { 0x1e, 0x21, },
- { 0x26, 0xd3, 0xdd, },
- { 0x2c, 0xd4, 0xb3, 0x3d, },
- { 0x86, 0x7b, 0x76, 0x3c, 0xf0, },
- { 0x12, 0xc3, 0x70, 0x1d, 0x55, 0x18, },
- { 0x96, 0xc2, 0xbd, 0x61, 0x55, 0xf4, 0x24, },
- { 0x20, 0x51, 0xf7, 0x86, 0x58, 0x8f, 0x07, 0x2a, },
- { 0x93, 0x15, 0xa8, 0x1d, 0xda, 0x97, 0xee, 0x0e, 0x6c, },
- { 0x39, 0x93, 0xdf, 0xd5, 0x0e, 0xca, 0xdc, 0x7a, 0x92, 0xce, },
- { 0x60, 0xd5, 0xfd, 0xf5, 0x1b, 0x26, 0x82, 0x26, 0x73, 0x02, 0xbc, },
- { 0x98, 0xf2, 0x34, 0xe1, 0xf5, 0xfb, 0x00, 0xac, 0x10, 0x4a, 0x38, 0x9f, },
- { 0xda, 0x3a, 0x92, 0x8a, 0xd0, 0xcd, 0x12, 0xcd, 0x15, 0xbb, 0xab, 0x77,
- 0x66, },
- { 0xa2, 0x92, 0x1a, 0xe5, 0xca, 0x0c, 0x30, 0x75, 0xeb, 0xaf, 0x00, 0x31,
- 0x55, 0x66, },
- { 0x06, 0xea, 0xfd, 0x3e, 0x86, 0x38, 0x62, 0x4e, 0xa9, 0x12, 0xa4, 0x12,
- 0x43, 0xbf, 0xa1, },
- { 0xe4, 0x71, 0x7b, 0x94, 0xdb, 0xa0, 0xd2, 0xff, 0x9b, 0xeb, 0xad, 0x8e,
- 0x95, 0x8a, 0xc5, 0xed, },
- { 0x25, 0x5a, 0x77, 0x71, 0x41, 0x0e, 0x7a, 0xe9, 0xed, 0x0c, 0x10, 0xef,
- 0xf6, 0x2b, 0x3a, 0xba, 0x60, },
- { 0xee, 0xe2, 0xa3, 0x67, 0x64, 0x1d, 0xc6, 0x04, 0xc4, 0xe1, 0x68, 0xd2,
- 0x6e, 0xd2, 0x91, 0x75, 0x53, 0x07, },
- { 0xe0, 0xf6, 0x4d, 0x8f, 0x68, 0xfc, 0x06, 0x7e, 0x18, 0x79, 0x7f, 0x2b,
- 0x6d, 0xef, 0x46, 0x7f, 0xab, 0xb2, 0xad, },
- { 0x3d, 0x35, 0x88, 0x9f, 0x2e, 0xcf, 0x96, 0x45, 0x07, 0x60, 0x71, 0x94,
- 0x00, 0x8d, 0xbf, 0xf4, 0xef, 0x46, 0x2e, 0x3c, },
- { 0x43, 0xcf, 0x98, 0xf7, 0x2d, 0xf4, 0x17, 0xe7, 0x8c, 0x05, 0x2d, 0x9b,
- 0x24, 0xfb, 0x4d, 0xea, 0x4a, 0xec, 0x01, 0x25, 0x29, },
- { 0x8e, 0x73, 0x9a, 0x78, 0x11, 0xfe, 0x48, 0xa0, 0x3b, 0x1a, 0x26, 0xdf,
- 0x25, 0xe9, 0x59, 0x1c, 0x70, 0x07, 0x9f, 0xdc, 0xa0, 0xa6, },
- { 0xe8, 0x47, 0x71, 0xc7, 0x3e, 0xdf, 0xb5, 0x13, 0xb9, 0x85, 0x13, 0xa8,
- 0x54, 0x47, 0x6e, 0x59, 0x96, 0x09, 0x13, 0x5f, 0x82, 0x16, 0x0b, },
- { 0xfb, 0xc0, 0x8c, 0x03, 0x21, 0xb3, 0xc4, 0xb5, 0x43, 0x32, 0x6c, 0xea,
- 0x7f, 0xa8, 0x43, 0x91, 0xe8, 0x4e, 0x3f, 0xbf, 0x45, 0x58, 0x6a, 0xa3, },
- { 0x55, 0xf8, 0xf3, 0x00, 0x76, 0x09, 0xef, 0x69, 0x5d, 0xd2, 0x8a, 0xf2,
- 0x65, 0xc3, 0xcb, 0x9b, 0x43, 0xfd, 0xb1, 0x7e, 0x7f, 0xa1, 0x94, 0xb0,
- 0xd7, },
- { 0xaa, 0x13, 0xc1, 0x51, 0x40, 0x6d, 0x8d, 0x4c, 0x0a, 0x95, 0x64, 0x7b,
- 0xd1, 0x96, 0xb6, 0x56, 0xb4, 0x5b, 0xcf, 0xd6, 0xd9, 0x15, 0x97, 0xdd,
- 0xb6, 0xef, },
- { 0xaf, 0xb7, 0x36, 0xb0, 0x04, 0xdb, 0xd7, 0x9c, 0x9a, 0x44, 0xc4, 0xf6,
- 0x1f, 0x12, 0x21, 0x2d, 0x59, 0x30, 0x54, 0xab, 0x27, 0x61, 0xa3, 0x57,
- 0xef, 0xf8, 0x53, },
- { 0x97, 0x34, 0x45, 0x3e, 0xce, 0x7c, 0x35, 0xa2, 0xda, 0x9f, 0x4b, 0x46,
- 0x6c, 0x11, 0x67, 0xff, 0x2f, 0x76, 0x58, 0x15, 0x71, 0xfa, 0x44, 0x89,
- 0x89, 0xfd, 0xf7, 0x99, },
- { 0x1f, 0xb1, 0x62, 0xeb, 0x83, 0xc5, 0x9c, 0x89, 0xf9, 0x2c, 0xd2, 0x03,
- 0x61, 0xbc, 0xbb, 0xa5, 0x74, 0x0e, 0x9b, 0x7e, 0x82, 0x3e, 0x70, 0x0a,
- 0xa9, 0x8f, 0x2b, 0x59, 0xfb, },
- { 0xf8, 0xca, 0x5e, 0x3a, 0x4f, 0x9e, 0x10, 0x69, 0x10, 0xd5, 0x4c, 0xeb,
- 0x1a, 0x0f, 0x3c, 0x6a, 0x98, 0xf5, 0xb0, 0x97, 0x5b, 0x37, 0x2f, 0x0d,
- 0xbd, 0x42, 0x4b, 0x69, 0xa1, 0x82, },
- { 0x12, 0x8c, 0x6d, 0x52, 0x08, 0xef, 0x74, 0xb2, 0xe6, 0xaa, 0xd3, 0xb0,
- 0x26, 0xb0, 0xd9, 0x94, 0xb6, 0x11, 0x45, 0x0e, 0x36, 0x71, 0x14, 0x2d,
- 0x41, 0x8c, 0x21, 0x53, 0x31, 0xe9, 0x68, },
- { 0xee, 0xea, 0x0d, 0x89, 0x47, 0x7e, 0x72, 0xd1, 0xd8, 0xce, 0x58, 0x4c,
- 0x94, 0x1f, 0x0d, 0x51, 0x08, 0xa3, 0xb6, 0x3d, 0xe7, 0x82, 0x46, 0x92,
- 0xd6, 0x98, 0x6b, 0x07, 0x10, 0x65, 0x52, 0x65, },
-};
-
-static bool __init noinline_for_stack blake2s_digest_test(void)
-{
- u8 key[BLAKE2S_KEY_SIZE];
- u8 buf[ARRAY_SIZE(blake2s_testvecs)];
- u8 hash[BLAKE2S_HASH_SIZE];
- struct blake2s_state state;
- bool success = true;
- int i, l;
-
- key[0] = key[1] = 1;
- for (i = 2; i < sizeof(key); ++i)
- key[i] = key[i - 2] + key[i - 1];
-
- for (i = 0; i < sizeof(buf); ++i)
- buf[i] = (u8)i;
-
- for (i = l = 0; i < ARRAY_SIZE(blake2s_testvecs); l = (l + 37) % ++i) {
- int outlen = 1 + i % BLAKE2S_HASH_SIZE;
- int keylen = (13 * i) % (BLAKE2S_KEY_SIZE + 1);
-
- blake2s(hash, buf, key + BLAKE2S_KEY_SIZE - keylen, outlen, i,
- keylen);
- if (memcmp(hash, blake2s_testvecs[i], outlen)) {
- pr_err("blake2s self-test %d: FAIL\n", i + 1);
- success = false;
- }
-
- if (!keylen)
- blake2s_init(&state, outlen);
- else
- blake2s_init_key(&state, outlen,
- key + BLAKE2S_KEY_SIZE - keylen,
- keylen);
-
- blake2s_update(&state, buf, l);
- blake2s_update(&state, buf + l, i - l);
- blake2s_final(&state, hash);
- if (memcmp(hash, blake2s_testvecs[i], outlen)) {
- pr_err("blake2s init/update/final self-test %d: FAIL\n",
- i + 1);
- success = false;
- }
- }
-
- return success;
-}
-
-static bool __init noinline_for_stack blake2s_random_test(void)
-{
- struct blake2s_state state;
- bool success = true;
- int i, l;
-
- for (i = 0; i < 32; ++i) {
- enum { TEST_ALIGNMENT = 16 };
- u8 blocks[BLAKE2S_BLOCK_SIZE * 2 + TEST_ALIGNMENT - 1]
- __aligned(TEST_ALIGNMENT);
- u8 *unaligned_block = blocks + BLAKE2S_BLOCK_SIZE;
- struct blake2s_state state1, state2;
-
- get_random_bytes(blocks, sizeof(blocks));
- get_random_bytes(&state, sizeof(state));
-
-#if defined(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) && \
- defined(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S)
- memcpy(&state1, &state, sizeof(state1));
- memcpy(&state2, &state, sizeof(state2));
- blake2s_compress(&state1, blocks, 2, BLAKE2S_BLOCK_SIZE);
- blake2s_compress_generic(&state2, blocks, 2, BLAKE2S_BLOCK_SIZE);
- if (memcmp(&state1, &state2, sizeof(state1))) {
- pr_err("blake2s random compress self-test %d: FAIL\n",
- i + 1);
- success = false;
- }
-#endif
-
- memcpy(&state1, &state, sizeof(state1));
- blake2s_compress(&state1, blocks, 1, BLAKE2S_BLOCK_SIZE);
- for (l = 1; l < TEST_ALIGNMENT; ++l) {
- memcpy(unaligned_block + l, blocks,
- BLAKE2S_BLOCK_SIZE);
- memcpy(&state2, &state, sizeof(state2));
- blake2s_compress(&state2, unaligned_block + l, 1,
- BLAKE2S_BLOCK_SIZE);
- if (memcmp(&state1, &state2, sizeof(state1))) {
- pr_err("blake2s random compress align %d self-test %d: FAIL\n",
- l, i + 1);
- success = false;
- }
- }
- }
-
- return success;
-}
-
-bool __init blake2s_selftest(void)
-{
- bool success;
-
- success = blake2s_digest_test();
- success &= blake2s_random_test();
-
- return success;
-}
diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c
index b0f9a678300b..6182c21ed943 100644
--- a/lib/crypto/blake2s.c
+++ b/lib/crypto/blake2s.c
@@ -8,64 +8,158 @@
*
*/
-#include <crypto/internal/blake2s.h>
-#include <linux/types.h>
-#include <linux/string.h>
+#include <crypto/blake2s.h>
+#include <linux/bug.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/bug.h>
+#include <linux/string.h>
+#include <linux/types.h>
-static inline void blake2s_set_lastblock(struct blake2s_state *state)
+static const u8 blake2s_sigma[10][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+};
+
+static inline void blake2s_increment_counter(struct blake2s_ctx *ctx, u32 inc)
{
- state->f[0] = -1;
+ ctx->t[0] += inc;
+ ctx->t[1] += (ctx->t[0] < inc);
}
-void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen)
+static void __maybe_unused
+blake2s_compress_generic(struct blake2s_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc)
{
- const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
+ u32 m[16];
+ u32 v[16];
+ int i;
+
+ WARN_ON(IS_ENABLED(DEBUG) &&
+ (nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE));
+
+ while (nblocks > 0) {
+ blake2s_increment_counter(ctx, inc);
+ memcpy(m, data, BLAKE2S_BLOCK_SIZE);
+ le32_to_cpu_array(m, ARRAY_SIZE(m));
+ memcpy(v, ctx->h, 32);
+ v[ 8] = BLAKE2S_IV0;
+ v[ 9] = BLAKE2S_IV1;
+ v[10] = BLAKE2S_IV2;
+ v[11] = BLAKE2S_IV3;
+ v[12] = BLAKE2S_IV4 ^ ctx->t[0];
+ v[13] = BLAKE2S_IV5 ^ ctx->t[1];
+ v[14] = BLAKE2S_IV6 ^ ctx->f[0];
+ v[15] = BLAKE2S_IV7 ^ ctx->f[1];
+
+#define G(r, i, a, b, c, d) do { \
+ a += b + m[blake2s_sigma[r][2 * i + 0]]; \
+ d = ror32(d ^ a, 16); \
+ c += d; \
+ b = ror32(b ^ c, 12); \
+ a += b + m[blake2s_sigma[r][2 * i + 1]]; \
+ d = ror32(d ^ a, 8); \
+ c += d; \
+ b = ror32(b ^ c, 7); \
+} while (0)
+
+#define ROUND(r) do { \
+ G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
+ G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
+ G(r, 2, v[2], v[ 6], v[10], v[14]); \
+ G(r, 3, v[3], v[ 7], v[11], v[15]); \
+ G(r, 4, v[0], v[ 5], v[10], v[15]); \
+ G(r, 5, v[1], v[ 6], v[11], v[12]); \
+ G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
+ G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
+} while (0)
+ ROUND(0);
+ ROUND(1);
+ ROUND(2);
+ ROUND(3);
+ ROUND(4);
+ ROUND(5);
+ ROUND(6);
+ ROUND(7);
+ ROUND(8);
+ ROUND(9);
+
+#undef G
+#undef ROUND
+
+ for (i = 0; i < 8; ++i)
+ ctx->h[i] ^= v[i] ^ v[i + 8];
+
+ data += BLAKE2S_BLOCK_SIZE;
+ --nblocks;
+ }
+}
+
+#ifdef CONFIG_CRYPTO_LIB_BLAKE2S_ARCH
+#include "blake2s.h" /* $(SRCARCH)/blake2s.h */
+#else
+#define blake2s_compress blake2s_compress_generic
+#endif
+
+static inline void blake2s_set_lastblock(struct blake2s_ctx *ctx)
+{
+ ctx->f[0] = -1;
+}
+
+void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen)
+{
+ const size_t fill = BLAKE2S_BLOCK_SIZE - ctx->buflen;
if (unlikely(!inlen))
return;
if (inlen > fill) {
- memcpy(state->buf + state->buflen, in, fill);
- blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
- state->buflen = 0;
+ memcpy(ctx->buf + ctx->buflen, in, fill);
+ blake2s_compress(ctx, ctx->buf, 1, BLAKE2S_BLOCK_SIZE);
+ ctx->buflen = 0;
in += fill;
inlen -= fill;
}
if (inlen > BLAKE2S_BLOCK_SIZE) {
const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
- blake2s_compress(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
+
+ blake2s_compress(ctx, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
}
- memcpy(state->buf + state->buflen, in, inlen);
- state->buflen += inlen;
+ memcpy(ctx->buf + ctx->buflen, in, inlen);
+ ctx->buflen += inlen;
}
EXPORT_SYMBOL(blake2s_update);
-void blake2s_final(struct blake2s_state *state, u8 *out)
+void blake2s_final(struct blake2s_ctx *ctx, u8 *out)
{
WARN_ON(IS_ENABLED(DEBUG) && !out);
- blake2s_set_lastblock(state);
- memset(state->buf + state->buflen, 0,
- BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
- blake2s_compress(state, state->buf, 1, state->buflen);
- cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
- memcpy(out, state->h, state->outlen);
- memzero_explicit(state, sizeof(*state));
+ blake2s_set_lastblock(ctx);
+ memset(ctx->buf + ctx->buflen, 0,
+ BLAKE2S_BLOCK_SIZE - ctx->buflen); /* Padding */
+ blake2s_compress(ctx, ctx->buf, 1, ctx->buflen);
+ cpu_to_le32_array(ctx->h, ARRAY_SIZE(ctx->h));
+ memcpy(out, ctx->h, ctx->outlen);
+ memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL(blake2s_final);
+#ifdef blake2s_mod_init_arch
static int __init blake2s_mod_init(void)
{
- if (IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) &&
- WARN_ON(!blake2s_selftest()))
- return -ENODEV;
+ blake2s_mod_init_arch();
return 0;
}
+subsys_initcall(blake2s_mod_init);
+#endif
-module_init(blake2s_mod_init);
MODULE_DESCRIPTION("BLAKE2s hash function");
MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/lib/crypto/chacha-block-generic.c b/lib/crypto/chacha-block-generic.c
new file mode 100644
index 000000000000..77f68de71066
--- /dev/null
+++ b/lib/crypto/chacha-block-generic.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <crypto/chacha.h>
+#include <linux/bitops.h>
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+static void chacha_permute(struct chacha_state *state, int nrounds)
+{
+ u32 *x = state->x;
+ int i;
+
+ /* whitelist the allowed round counts */
+ WARN_ON_ONCE(nrounds != 20 && nrounds != 12);
+
+ for (i = 0; i < nrounds; i += 2) {
+ x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16);
+ x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16);
+ x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16);
+ x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16);
+
+ x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12);
+ x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12);
+ x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12);
+ x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12);
+
+ x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8);
+ x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8);
+ x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8);
+ x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8);
+
+ x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7);
+ x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7);
+ x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7);
+ x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7);
+
+ x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16);
+ x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16);
+ x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16);
+ x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16);
+
+ x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12);
+ x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12);
+ x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12);
+ x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12);
+
+ x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8);
+ x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8);
+ x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8);
+ x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8);
+
+ x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7);
+ x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7);
+ x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7);
+ x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7);
+ }
+}
+
+/**
+ * chacha_block_generic - generate one keystream block and increment block counter
+ * @state: input state matrix
+ * @out: output keystream block
+ * @nrounds: number of rounds (20 or 12; 20 is recommended)
+ *
+ * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
+ * The caller has already converted the endianness of the input. This function
+ * also handles incrementing the block counter in the input matrix.
+ */
+void chacha_block_generic(struct chacha_state *state,
+ u8 out[CHACHA_BLOCK_SIZE], int nrounds)
+{
+ struct chacha_state permuted_state = *state;
+ int i;
+
+ chacha_permute(&permuted_state, nrounds);
+
+ for (i = 0; i < ARRAY_SIZE(state->x); i++)
+ put_unaligned_le32(permuted_state.x[i] + state->x[i],
+ &out[i * sizeof(u32)]);
+
+ state->x[12]++;
+}
+EXPORT_SYMBOL(chacha_block_generic);
+
+/**
+ * hchacha_block_generic - abbreviated ChaCha core, for XChaCha
+ * @state: input state matrix
+ * @out: the output words
+ * @nrounds: number of rounds (20 or 12; 20 is recommended)
+ *
+ * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
+ * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha
+ * skips the final addition of the initial state, and outputs only certain words
+ * of the state. It should not be used for streaming directly.
+ */
+void hchacha_block_generic(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+ struct chacha_state permuted_state = *state;
+
+ chacha_permute(&permuted_state, nrounds);
+
+ memcpy(&out[0], &permuted_state.x[0], 16);
+ memcpy(&out[4], &permuted_state.x[12], 16);
+}
+EXPORT_SYMBOL(hchacha_block_generic);
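
[Illustration, not part of the patch: driving the generic block function
directly. chacha_init() is assumed to keep its usual library prototype,
taking the 256-bit key as eight u32 words and a 16-byte IV whose first
word is the block counter.]

    #include <crypto/chacha.h>

    static void demo_keystream(const u32 key[8], const u8 iv[16],
                               u8 ks[2 * CHACHA_BLOCK_SIZE])
    {
            struct chacha_state state;

            chacha_init(&state, key, iv);
            /* Each call emits 64 bytes and bumps the counter in state->x[12]. */
            chacha_block_generic(&state, ks, 20);
            chacha_block_generic(&state, ks + CHACHA_BLOCK_SIZE, 20);
    }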
diff --git a/lib/crypto/chacha.c b/lib/crypto/chacha.c
index ced87dd31a97..e0c7cb4af318 100644
--- a/lib/crypto/chacha.c
+++ b/lib/crypto/chacha.c
@@ -1,114 +1,70 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
+ * The ChaCha stream cipher (RFC7539)
*
* Copyright (C) 2015 Martin Willi
*/
-#include <linux/bug.h>
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/bitops.h>
-#include <linux/string.h>
-#include <linux/unaligned.h>
+#include <crypto/algapi.h> // for crypto_xor_cpy
#include <crypto/chacha.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
-static void chacha_permute(struct chacha_state *state, int nrounds)
+static void __maybe_unused
+chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
{
- u32 *x = state->x;
- int i;
-
- /* whitelist the allowed round counts */
- WARN_ON_ONCE(nrounds != 20 && nrounds != 12);
-
- for (i = 0; i < nrounds; i += 2) {
- x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16);
- x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16);
- x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16);
- x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16);
-
- x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12);
- x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12);
- x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12);
- x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12);
-
- x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8);
- x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8);
- x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8);
- x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8);
-
- x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7);
- x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7);
- x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7);
- x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7);
-
- x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16);
- x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16);
- x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16);
- x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16);
-
- x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12);
- x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12);
- x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12);
- x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12);
-
- x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8);
- x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8);
- x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8);
- x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8);
-
- x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7);
- x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7);
- x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7);
- x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7);
+ /* aligned to potentially speed up crypto_xor() */
+ u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long));
+
+ while (bytes >= CHACHA_BLOCK_SIZE) {
+ chacha_block_generic(state, stream, nrounds);
+ crypto_xor_cpy(dst, src, stream, CHACHA_BLOCK_SIZE);
+ bytes -= CHACHA_BLOCK_SIZE;
+ dst += CHACHA_BLOCK_SIZE;
+ src += CHACHA_BLOCK_SIZE;
+ }
+ if (bytes) {
+ chacha_block_generic(state, stream, nrounds);
+ crypto_xor_cpy(dst, src, stream, bytes);
}
}
-/**
- * chacha_block_generic - generate one keystream block and increment block counter
- * @state: input state matrix
- * @out: output keystream block
- * @nrounds: number of rounds (20 or 12; 20 is recommended)
- *
- * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
- * The caller has already converted the endianness of the input. This function
- * also handles incrementing the block counter in the input matrix.
- */
-void chacha_block_generic(struct chacha_state *state,
- u8 out[CHACHA_BLOCK_SIZE], int nrounds)
-{
- struct chacha_state permuted_state = *state;
- int i;
-
- chacha_permute(&permuted_state, nrounds);
+#ifdef CONFIG_CRYPTO_LIB_CHACHA_ARCH
+#include "chacha.h" /* $(SRCARCH)/chacha.h */
+#else
+#define chacha_crypt_arch chacha_crypt_generic
+#define hchacha_block_arch hchacha_block_generic
+#endif
- for (i = 0; i < ARRAY_SIZE(state->x); i++)
- put_unaligned_le32(permuted_state.x[i] + state->x[i],
- &out[i * sizeof(u32)]);
-
- state->x[12]++;
+void chacha_crypt(struct chacha_state *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ chacha_crypt_arch(state, dst, src, bytes, nrounds);
}
-EXPORT_SYMBOL(chacha_block_generic);
+EXPORT_SYMBOL_GPL(chacha_crypt);
-/**
- * hchacha_block_generic - abbreviated ChaCha core, for XChaCha
- * @state: input state matrix
- * @out: the output words
- * @nrounds: number of rounds (20 or 12; 20 is recommended)
- *
- * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
- * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha
- * skips the final addition of the initial state, and outputs only certain words
- * of the state. It should not be used for streaming directly.
- */
-void hchacha_block_generic(const struct chacha_state *state,
- u32 out[HCHACHA_OUT_WORDS], int nrounds)
+void hchacha_block(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds)
{
- struct chacha_state permuted_state = *state;
+ hchacha_block_arch(state, out, nrounds);
+}
+EXPORT_SYMBOL_GPL(hchacha_block);
- chacha_permute(&permuted_state, nrounds);
+#ifdef chacha_mod_init_arch
+static int __init chacha_mod_init(void)
+{
+ chacha_mod_init_arch();
+ return 0;
+}
+subsys_initcall(chacha_mod_init);
- memcpy(&out[0], &permuted_state.x[0], 16);
- memcpy(&out[4], &permuted_state.x[12], 16);
+static void __exit chacha_mod_exit(void)
+{
}
-EXPORT_SYMBOL(hchacha_block_generic);
+module_exit(chacha_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("ChaCha stream cipher (RFC7539)");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/chacha20poly1305.c b/lib/crypto/chacha20poly1305.c
index e29eed49a5a1..212ce33562af 100644
--- a/lib/crypto/chacha20poly1305.c
+++ b/lib/crypto/chacha20poly1305.c
@@ -7,16 +7,16 @@
* Information: https://tools.ietf.org/html/rfc8439
*/
-#include <crypto/chacha20poly1305.h>
#include <crypto/chacha.h>
+#include <crypto/chacha20poly1305.h>
#include <crypto/poly1305.h>
#include <crypto/utils.h>
-
-#include <linux/unaligned.h>
-#include <linux/kernel.h>
+#include <linux/export.h>
#include <linux/init.h>
+#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/unaligned.h>
static void chacha_load_key(u32 *k, const u8 *in)
{
@@ -89,7 +89,7 @@ __chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
const u8 *ad, const size_t ad_len,
const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
{
struct chacha_state chacha_state;
u32 k[CHACHA_KEY_WORDS];
@@ -111,8 +111,8 @@ EXPORT_SYMBOL(chacha20poly1305_encrypt);
void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
const u8 *ad, const size_t ad_len,
- const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
+ const u8 nonce[at_least XCHACHA20POLY1305_NONCE_SIZE],
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
{
struct chacha_state chacha_state;
@@ -170,7 +170,7 @@ __chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
const u8 *ad, const size_t ad_len,
const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
{
struct chacha_state chacha_state;
u32 k[CHACHA_KEY_WORDS];
@@ -195,8 +195,8 @@ EXPORT_SYMBOL(chacha20poly1305_decrypt);
bool xchacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
const u8 *ad, const size_t ad_len,
- const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
+ const u8 nonce[at_least XCHACHA20POLY1305_NONCE_SIZE],
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
{
struct chacha_state chacha_state;
@@ -211,7 +211,7 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src,
const size_t src_len,
const u8 *ad, const size_t ad_len,
const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE],
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE],
int encrypt)
{
const u8 *pad0 = page_address(ZERO_PAGE(0));
@@ -335,7 +335,7 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src,
bool chacha20poly1305_encrypt_sg_inplace(struct scatterlist *src, size_t src_len,
const u8 *ad, const size_t ad_len,
const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
{
return chacha20poly1305_crypt_sg_inplace(src, src_len, ad, ad_len,
nonce, key, 1);
@@ -345,7 +345,7 @@ EXPORT_SYMBOL(chacha20poly1305_encrypt_sg_inplace);
bool chacha20poly1305_decrypt_sg_inplace(struct scatterlist *src, size_t src_len,
const u8 *ad, const size_t ad_len,
const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
{
if (unlikely(src_len < POLY1305_DIGEST_SIZE))
return false;
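
[Illustration, not part of the patch: a round trip through the AEAD interface
whose key/nonce parameters gained at_least annotations above. The ciphertext
buffer must leave room for the 16-byte Poly1305 tag
(CHACHA20POLY1305_AUTHTAG_SIZE), and the 64-bit nonce must never repeat under
one key.]

    #include <crypto/chacha20poly1305.h>

    static bool demo_aead(const u8 key[CHACHA20POLY1305_KEY_SIZE],
                          const u8 *msg, size_t len, u64 nonce)
    {
            u8 ct[256 + CHACHA20POLY1305_AUTHTAG_SIZE];
            u8 pt[256];

            if (len > sizeof(pt))
                    return false;
            chacha20poly1305_encrypt(ct, msg, len, NULL, 0, nonce, key);
            /* Decryption authenticates first; false means a bad tag. */
            return chacha20poly1305_decrypt(pt, ct,
                                            len + CHACHA20POLY1305_AUTHTAG_SIZE,
                                            NULL, 0, nonce, key);
    }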
diff --git a/lib/crypto/curve25519-generic.c b/lib/crypto/curve25519-generic.c
deleted file mode 100644
index de7c99172fa2..000000000000
--- a/lib/crypto/curve25519-generic.c
+++ /dev/null
@@ -1,24 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * This is an implementation of the Curve25519 ECDH algorithm, using either
- * a 32-bit implementation or a 64-bit implementation with 128-bit integers,
- * depending on what is supported by the target compiler.
- *
- * Information: https://cr.yp.to/ecdh.html
- */
-
-#include <crypto/curve25519.h>
-#include <linux/module.h>
-
-const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 };
-const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 };
-
-EXPORT_SYMBOL(curve25519_null_point);
-EXPORT_SYMBOL(curve25519_base_point);
-EXPORT_SYMBOL(curve25519_generic);
-
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("Curve25519 scalar multiplication");
-MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/lib/crypto/curve25519.c b/lib/crypto/curve25519.c
index 6850b76a80c9..01e265dfbcd9 100644
--- a/lib/crypto/curve25519.c
+++ b/lib/crypto/curve25519.c
@@ -2,32 +2,77 @@
/*
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*
- * This is an implementation of the Curve25519 ECDH algorithm, using either
- * a 32-bit implementation or a 64-bit implementation with 128-bit integers,
+ * This is an implementation of the Curve25519 ECDH algorithm, using either an
+ * architecture-optimized implementation or a generic implementation. The
+ * generic implementation is either 32-bit, or 64-bit with 128-bit integers,
* depending on what is supported by the target compiler.
*
* Information: https://cr.yp.to/ecdh.html
*/
#include <crypto/curve25519.h>
-#include <linux/module.h>
+#include <crypto/utils.h>
+#include <linux/export.h>
#include <linux/init.h>
+#include <linux/module.h>
-static int __init curve25519_init(void)
+static const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 };
+static const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 };
+
+#ifdef CONFIG_CRYPTO_LIB_CURVE25519_ARCH
+#include "curve25519.h" /* $(SRCARCH)/curve25519.h */
+#else
+static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE])
{
- if (IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) &&
- WARN_ON(!curve25519_selftest()))
- return -ENODEV;
- return 0;
+ curve25519_generic(mypublic, secret, basepoint);
+}
+
+static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE])
+{
+ curve25519_generic(pub, secret, curve25519_base_point);
+}
+#endif
+
+bool __must_check
+curve25519(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE])
+{
+ curve25519_arch(mypublic, secret, basepoint);
+ return crypto_memneq(mypublic, curve25519_null_point,
+ CURVE25519_KEY_SIZE);
+}
+EXPORT_SYMBOL(curve25519);
+
+bool __must_check
+curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE])
+{
+ if (unlikely(!crypto_memneq(secret, curve25519_null_point,
+ CURVE25519_KEY_SIZE)))
+ return false;
+ curve25519_base_arch(pub, secret);
+ return crypto_memneq(pub, curve25519_null_point, CURVE25519_KEY_SIZE);
}
+EXPORT_SYMBOL(curve25519_generate_public);
-static void __exit curve25519_exit(void)
+#ifdef curve25519_mod_init_arch
+static int __init curve25519_mod_init(void)
{
+ curve25519_mod_init_arch();
+ return 0;
}
+subsys_initcall(curve25519_mod_init);
-module_init(curve25519_init);
-module_exit(curve25519_exit);
+static void __exit curve25519_mod_exit(void)
+{
+}
+module_exit(curve25519_mod_exit);
+#endif
MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("Curve25519 scalar multiplication");
+MODULE_DESCRIPTION("Curve25519 algorithm");
MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/lib/crypto/des.c b/lib/crypto/des.c
index d3423b34a8e9..a906070136dc 100644
--- a/lib/crypto/des.c
+++ b/lib/crypto/des.c
@@ -7,21 +7,20 @@
* Copyright (c) 2005 Dag Arne Osvik <da@osvik.no>
*/
+#include <crypto/des.h>
+#include <crypto/internal/des.h>
#include <linux/bitops.h>
#include <linux/compiler.h>
#include <linux/crypto.h>
#include <linux/errno.h>
+#include <linux/export.h>
#include <linux/fips.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/types.h>
-
#include <linux/unaligned.h>
-#include <crypto/des.h>
-#include <crypto/internal/des.h>
-
#define ROL(x, r) ((x) = rol32((x), (r)))
#define ROR(x, r) ((x) = ror32((x), (r)))
diff --git a/lib/crypto/fips.h b/lib/crypto/fips.h
new file mode 100644
index 000000000000..023410c2e0db
--- /dev/null
+++ b/lib/crypto/fips.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: gen-fips-testvecs.py */
+
+#include <linux/fips.h>
+
+static const u8 fips_test_data[] __initconst __maybe_unused = {
+ 0x66, 0x69, 0x70, 0x73, 0x20, 0x74, 0x65, 0x73,
+ 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00,
+};
+
+static const u8 fips_test_key[] __initconst __maybe_unused = {
+ 0x66, 0x69, 0x70, 0x73, 0x20, 0x74, 0x65, 0x73,
+ 0x74, 0x20, 0x6b, 0x65, 0x79, 0x00, 0x00, 0x00,
+};
+
+static const u8 fips_test_hmac_sha1_value[] __initconst __maybe_unused = {
+ 0x29, 0xa9, 0x88, 0xb8, 0x5c, 0xb4, 0xaf, 0x4b,
+ 0x97, 0x2a, 0xee, 0x87, 0x5b, 0x0a, 0x02, 0x55,
+ 0x99, 0xbf, 0x86, 0x78,
+};
+
+static const u8 fips_test_hmac_sha256_value[] __initconst __maybe_unused = {
+ 0x59, 0x25, 0x85, 0xcc, 0x40, 0xe9, 0x64, 0x2f,
+ 0xe9, 0xbf, 0x82, 0xb7, 0xd3, 0x15, 0x3d, 0x43,
+ 0x22, 0x0b, 0x4c, 0x00, 0x90, 0x14, 0x25, 0xcf,
+ 0x9e, 0x13, 0x2b, 0xc2, 0x30, 0xe6, 0xe8, 0x93,
+};
+
+static const u8 fips_test_hmac_sha512_value[] __initconst __maybe_unused = {
+ 0x6b, 0xea, 0x5d, 0x27, 0x49, 0x5b, 0x3f, 0xea,
+ 0xde, 0x2d, 0xfa, 0x32, 0x75, 0xdb, 0x77, 0xc8,
+ 0x26, 0xe9, 0x4e, 0x95, 0x4d, 0xad, 0x88, 0x02,
+ 0x87, 0xf9, 0x52, 0x0a, 0xd1, 0x92, 0x80, 0x1d,
+ 0x92, 0x7e, 0x3c, 0xbd, 0xb1, 0x3c, 0x49, 0x98,
+ 0x44, 0x9c, 0x8f, 0xee, 0x3f, 0x02, 0x71, 0x51,
+ 0x57, 0x0b, 0x15, 0x38, 0x95, 0xd8, 0xa3, 0x81,
+ 0xba, 0xb3, 0x15, 0x37, 0x5c, 0x6d, 0x57, 0x2b,
+};
+
+static const u8 fips_test_sha3_256_value[] __initconst __maybe_unused = {
+ 0x77, 0xc4, 0x8b, 0x69, 0x70, 0x5f, 0x0a, 0xb1,
+ 0xb1, 0xa5, 0x82, 0x0a, 0x22, 0x2b, 0x49, 0x31,
+ 0xba, 0x9b, 0xb6, 0xaa, 0x32, 0xa7, 0x97, 0x00,
+ 0x98, 0xdb, 0xff, 0xe7, 0xc6, 0xde, 0xb5, 0x82,
+};
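
[Illustration, not part of the patch: the shape of a power-on self-test that
would consume these __initconst vectors. The hmac_sha256_usingrawkey() call
and the panic-on-failure policy are assumptions modelled on existing kernel
FIPS behaviour, not code from this series.]

    #include <crypto/sha2.h>
    #include <linux/kernel.h>
    #include <linux/string.h>

    static int __init demo_fips_post(void)
    {
            u8 mac[SHA256_DIGEST_SIZE];

            if (!fips_enabled)
                    return 0;
            hmac_sha256_usingrawkey(fips_test_key, sizeof(fips_test_key),
                                    fips_test_data, sizeof(fips_test_data), mac);
            if (memcmp(mac, fips_test_hmac_sha256_value, sizeof(mac)))
                    panic("FIPS self-test failed: HMAC-SHA256");
            return 0;
    }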
diff --git a/lib/crypto/gf128mul.c b/lib/crypto/gf128mul.c
index fbe72cb3453a..2a34590fe3f1 100644
--- a/lib/crypto/gf128mul.c
+++ b/lib/crypto/gf128mul.c
@@ -49,6 +49,7 @@
*/
#include <crypto/gf128mul.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
diff --git a/lib/crypto/hash_info.c b/lib/crypto/hash_info.c
new file mode 100644
index 000000000000..9a467638c971
--- /dev/null
+++ b/lib/crypto/hash_info.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Hash Info: Hash algorithms information
+ *
+ * Copyright (c) 2013 Dmitry Kasatkin <d.kasatkin@samsung.com>
+ */
+
+#include <linux/export.h>
+#include <crypto/hash_info.h>
+
+const char *const hash_algo_name[HASH_ALGO__LAST] = {
+ [HASH_ALGO_MD4] = "md4",
+ [HASH_ALGO_MD5] = "md5",
+ [HASH_ALGO_SHA1] = "sha1",
+ [HASH_ALGO_RIPE_MD_160] = "rmd160",
+ [HASH_ALGO_SHA256] = "sha256",
+ [HASH_ALGO_SHA384] = "sha384",
+ [HASH_ALGO_SHA512] = "sha512",
+ [HASH_ALGO_SHA224] = "sha224",
+ [HASH_ALGO_RIPE_MD_128] = "rmd128",
+ [HASH_ALGO_RIPE_MD_256] = "rmd256",
+ [HASH_ALGO_RIPE_MD_320] = "rmd320",
+ [HASH_ALGO_WP_256] = "wp256",
+ [HASH_ALGO_WP_384] = "wp384",
+ [HASH_ALGO_WP_512] = "wp512",
+ [HASH_ALGO_TGR_128] = "tgr128",
+ [HASH_ALGO_TGR_160] = "tgr160",
+ [HASH_ALGO_TGR_192] = "tgr192",
+ [HASH_ALGO_SM3_256] = "sm3",
+ [HASH_ALGO_STREEBOG_256] = "streebog256",
+ [HASH_ALGO_STREEBOG_512] = "streebog512",
+ [HASH_ALGO_SHA3_256] = "sha3-256",
+ [HASH_ALGO_SHA3_384] = "sha3-384",
+ [HASH_ALGO_SHA3_512] = "sha3-512",
+};
+EXPORT_SYMBOL_GPL(hash_algo_name);
+
+const int hash_digest_size[HASH_ALGO__LAST] = {
+ [HASH_ALGO_MD4] = MD5_DIGEST_SIZE,
+ [HASH_ALGO_MD5] = MD5_DIGEST_SIZE,
+ [HASH_ALGO_SHA1] = SHA1_DIGEST_SIZE,
+ [HASH_ALGO_RIPE_MD_160] = RMD160_DIGEST_SIZE,
+ [HASH_ALGO_SHA256] = SHA256_DIGEST_SIZE,
+ [HASH_ALGO_SHA384] = SHA384_DIGEST_SIZE,
+ [HASH_ALGO_SHA512] = SHA512_DIGEST_SIZE,
+ [HASH_ALGO_SHA224] = SHA224_DIGEST_SIZE,
+ [HASH_ALGO_RIPE_MD_128] = RMD128_DIGEST_SIZE,
+ [HASH_ALGO_RIPE_MD_256] = RMD256_DIGEST_SIZE,
+ [HASH_ALGO_RIPE_MD_320] = RMD320_DIGEST_SIZE,
+ [HASH_ALGO_WP_256] = WP256_DIGEST_SIZE,
+ [HASH_ALGO_WP_384] = WP384_DIGEST_SIZE,
+ [HASH_ALGO_WP_512] = WP512_DIGEST_SIZE,
+ [HASH_ALGO_TGR_128] = TGR128_DIGEST_SIZE,
+ [HASH_ALGO_TGR_160] = TGR160_DIGEST_SIZE,
+ [HASH_ALGO_TGR_192] = TGR192_DIGEST_SIZE,
+ [HASH_ALGO_SM3_256] = SM3256_DIGEST_SIZE,
+ [HASH_ALGO_STREEBOG_256] = STREEBOG256_DIGEST_SIZE,
+ [HASH_ALGO_STREEBOG_512] = STREEBOG512_DIGEST_SIZE,
+ [HASH_ALGO_SHA3_256] = SHA3_256_DIGEST_SIZE,
+ [HASH_ALGO_SHA3_384] = SHA3_384_DIGEST_SIZE,
+ [HASH_ALGO_SHA3_512] = SHA3_512_DIGEST_SIZE,
+};
+EXPORT_SYMBOL_GPL(hash_digest_size);
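
[Illustration, not part of the patch: how consumers such as IMA index these
tables to map an enum hash_algo to a name and digest length.]

    #include <crypto/hash_info.h>
    #include <linux/errno.h>

    static int demo_hash_info(enum hash_algo algo, const char **name, int *dlen)
    {
            if (algo >= HASH_ALGO__LAST || !hash_algo_name[algo])
                    return -EINVAL;
            *name = hash_algo_name[algo];
            *dlen = hash_digest_size[algo];
            return 0;
    }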
diff --git a/lib/crypto/libchacha.c b/lib/crypto/libchacha.c
deleted file mode 100644
index ebcca381e248..000000000000
--- a/lib/crypto/libchacha.c
+++ /dev/null
@@ -1,36 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * The ChaCha stream cipher (RFC7539)
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/module.h>
-
-#include <crypto/algapi.h> // for crypto_xor_cpy
-#include <crypto/chacha.h>
-
-void chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- /* aligned to potentially speed up crypto_xor() */
- u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long));
-
- while (bytes >= CHACHA_BLOCK_SIZE) {
- chacha_block_generic(state, stream, nrounds);
- crypto_xor_cpy(dst, src, stream, CHACHA_BLOCK_SIZE);
- bytes -= CHACHA_BLOCK_SIZE;
- dst += CHACHA_BLOCK_SIZE;
- src += CHACHA_BLOCK_SIZE;
- }
- if (bytes) {
- chacha_block_generic(state, stream, nrounds);
- crypto_xor_cpy(dst, src, stream, bytes);
- }
-}
-EXPORT_SYMBOL(chacha_crypt_generic);
-
-MODULE_DESCRIPTION("ChaCha stream cipher (RFC7539)");
-MODULE_LICENSE("GPL");
diff --git a/lib/crypto/md5.c b/lib/crypto/md5.c
new file mode 100644
index 000000000000..c0610ea1370e
--- /dev/null
+++ b/lib/crypto/md5.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * MD5 and HMAC-MD5 library functions
+ *
+ * md5_block_generic() is derived from cryptoapi implementation, originally
+ * based on the public domain implementation written by Colin Plumb in 1993.
+ *
+ * Copyright (c) Cryptoapi developers.
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * Copyright 2025 Google LLC
+ */
+
+#include <crypto/hmac.h>
+#include <crypto/md5.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+#include <linux/wordpart.h>
+
+static const struct md5_block_state md5_iv = {
+ .h = { MD5_H0, MD5_H1, MD5_H2, MD5_H3 },
+};
+
+#define F1(x, y, z) (z ^ (x & (y ^ z)))
+#define F2(x, y, z) F1(z, x, y)
+#define F3(x, y, z) (x ^ y ^ z)
+#define F4(x, y, z) (y ^ (x | ~z))
+
+#define MD5STEP(f, w, x, y, z, in, s) \
+ (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x)
+
+static void md5_block_generic(struct md5_block_state *state,
+ const u8 data[MD5_BLOCK_SIZE])
+{
+ u32 in[MD5_BLOCK_WORDS];
+ u32 a, b, c, d;
+
+ memcpy(in, data, MD5_BLOCK_SIZE);
+ le32_to_cpu_array(in, ARRAY_SIZE(in));
+
+ a = state->h[0];
+ b = state->h[1];
+ c = state->h[2];
+ d = state->h[3];
+
+ MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
+ MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
+ MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
+ MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
+ MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
+ MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
+ MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
+ MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
+ MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
+ MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
+ MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
+ MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
+ MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
+ MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
+ MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
+ MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
+
+ MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
+ MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
+ MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
+ MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
+ MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
+ MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
+ MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
+ MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
+ MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
+ MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
+ MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
+ MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
+ MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
+ MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
+ MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
+ MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
+
+ MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
+ MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
+ MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
+ MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
+ MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
+ MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
+ MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
+ MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
+ MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
+ MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
+ MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
+ MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
+ MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
+ MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
+ MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
+ MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
+
+ MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
+ MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
+ MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
+ MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
+ MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
+ MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
+ MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
+ MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
+ MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
+ MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
+ MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
+ MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
+ MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
+ MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
+ MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
+ MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
+
+ state->h[0] += a;
+ state->h[1] += b;
+ state->h[2] += c;
+ state->h[3] += d;
+}
+
+static void __maybe_unused md5_blocks_generic(struct md5_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ do {
+ md5_block_generic(state, data);
+ data += MD5_BLOCK_SIZE;
+ } while (--nblocks);
+}
+
+#ifdef CONFIG_CRYPTO_LIB_MD5_ARCH
+#include "md5.h" /* $(SRCARCH)/md5.h */
+#else
+#define md5_blocks md5_blocks_generic
+#endif
+
+void md5_init(struct md5_ctx *ctx)
+{
+ ctx->state = md5_iv;
+ ctx->bytecount = 0;
+}
+EXPORT_SYMBOL_GPL(md5_init);
+
+void md5_update(struct md5_ctx *ctx, const u8 *data, size_t len)
+{
+ size_t partial = ctx->bytecount % MD5_BLOCK_SIZE;
+
+ ctx->bytecount += len;
+
+ if (partial + len >= MD5_BLOCK_SIZE) {
+ size_t nblocks;
+
+ if (partial) {
+ size_t l = MD5_BLOCK_SIZE - partial;
+
+ memcpy(&ctx->buf[partial], data, l);
+ data += l;
+ len -= l;
+
+ md5_blocks(&ctx->state, ctx->buf, 1);
+ }
+
+ nblocks = len / MD5_BLOCK_SIZE;
+ len %= MD5_BLOCK_SIZE;
+
+ if (nblocks) {
+ md5_blocks(&ctx->state, data, nblocks);
+ data += nblocks * MD5_BLOCK_SIZE;
+ }
+ partial = 0;
+ }
+ if (len)
+ memcpy(&ctx->buf[partial], data, len);
+}
+EXPORT_SYMBOL_GPL(md5_update);
+
+static void __md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE])
+{
+ u64 bitcount = ctx->bytecount << 3;
+ size_t partial = ctx->bytecount % MD5_BLOCK_SIZE;
+
+ ctx->buf[partial++] = 0x80;
+ if (partial > MD5_BLOCK_SIZE - 8) {
+ memset(&ctx->buf[partial], 0, MD5_BLOCK_SIZE - partial);
+ md5_blocks(&ctx->state, ctx->buf, 1);
+ partial = 0;
+ }
+ memset(&ctx->buf[partial], 0, MD5_BLOCK_SIZE - 8 - partial);
+ *(__le64 *)&ctx->buf[MD5_BLOCK_SIZE - 8] = cpu_to_le64(bitcount);
+ md5_blocks(&ctx->state, ctx->buf, 1);
+
+ cpu_to_le32_array(ctx->state.h, ARRAY_SIZE(ctx->state.h));
+ memcpy(out, ctx->state.h, MD5_DIGEST_SIZE);
+}
+
+void md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE])
+{
+ __md5_final(ctx, out);
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(md5_final);
+
+void md5(const u8 *data, size_t len, u8 out[MD5_DIGEST_SIZE])
+{
+ struct md5_ctx ctx;
+
+ md5_init(&ctx);
+ md5_update(&ctx, data, len);
+ md5_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(md5);
+
+static void __hmac_md5_preparekey(struct md5_block_state *istate,
+ struct md5_block_state *ostate,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ union {
+ u8 b[MD5_BLOCK_SIZE];
+ unsigned long w[MD5_BLOCK_SIZE / sizeof(unsigned long)];
+ } derived_key = { 0 };
+
+ if (unlikely(raw_key_len > MD5_BLOCK_SIZE))
+ md5(raw_key, raw_key_len, derived_key.b);
+ else
+ memcpy(derived_key.b, raw_key, raw_key_len);
+
+ for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+ derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE);
+ *istate = md5_iv;
+ md5_blocks(istate, derived_key.b, 1);
+
+ for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+ derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^
+ HMAC_IPAD_VALUE);
+ *ostate = md5_iv;
+ md5_blocks(ostate, derived_key.b, 1);
+
+ memzero_explicit(&derived_key, sizeof(derived_key));
+}
+
+void hmac_md5_preparekey(struct hmac_md5_key *key,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_md5_preparekey(&key->istate, &key->ostate, raw_key, raw_key_len);
+}
+EXPORT_SYMBOL_GPL(hmac_md5_preparekey);
+
+void hmac_md5_init(struct hmac_md5_ctx *ctx, const struct hmac_md5_key *key)
+{
+ ctx->hash_ctx.state = key->istate;
+ ctx->hash_ctx.bytecount = MD5_BLOCK_SIZE;
+ ctx->ostate = key->ostate;
+}
+EXPORT_SYMBOL_GPL(hmac_md5_init);
+
+void hmac_md5_init_usingrawkey(struct hmac_md5_ctx *ctx,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_md5_preparekey(&ctx->hash_ctx.state, &ctx->ostate,
+ raw_key, raw_key_len);
+ ctx->hash_ctx.bytecount = MD5_BLOCK_SIZE;
+}
+EXPORT_SYMBOL_GPL(hmac_md5_init_usingrawkey);
+
+void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE])
+{
+ /* Generate the padded input for the outer hash in ctx->hash_ctx.buf. */
+ __md5_final(&ctx->hash_ctx, ctx->hash_ctx.buf);
+ memset(&ctx->hash_ctx.buf[MD5_DIGEST_SIZE], 0,
+ MD5_BLOCK_SIZE - MD5_DIGEST_SIZE);
+ ctx->hash_ctx.buf[MD5_DIGEST_SIZE] = 0x80;
+ *(__le64 *)&ctx->hash_ctx.buf[MD5_BLOCK_SIZE - 8] =
+ cpu_to_le64(8 * (MD5_BLOCK_SIZE + MD5_DIGEST_SIZE));
+
+ /* Compute the outer hash, which gives the HMAC value. */
+ md5_blocks(&ctx->ostate, ctx->hash_ctx.buf, 1);
+ cpu_to_le32_array(ctx->ostate.h, ARRAY_SIZE(ctx->ostate.h));
+ memcpy(out, ctx->ostate.h, MD5_DIGEST_SIZE);
+
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(hmac_md5_final);
+
+void hmac_md5(const struct hmac_md5_key *key,
+ const u8 *data, size_t data_len, u8 out[MD5_DIGEST_SIZE])
+{
+ struct hmac_md5_ctx ctx;
+
+ hmac_md5_init(&ctx, key);
+ hmac_md5_update(&ctx, data, data_len);
+ hmac_md5_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_md5);
+
+void hmac_md5_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+ const u8 *data, size_t data_len,
+ u8 out[MD5_DIGEST_SIZE])
+{
+ struct hmac_md5_ctx ctx;
+
+ hmac_md5_init_usingrawkey(&ctx, raw_key, raw_key_len);
+ hmac_md5_update(&ctx, data, data_len);
+ hmac_md5_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_md5_usingrawkey);
+
+#ifdef md5_mod_init_arch
+static int __init md5_mod_init(void)
+{
+ md5_mod_init_arch();
+ return 0;
+}
+subsys_initcall(md5_mod_init);
+
+static void __exit md5_mod_exit(void)
+{
+}
+module_exit(md5_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("MD5 and HMAC-MD5 library functions");
+MODULE_LICENSE("GPL");
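
[Illustration, not part of the patch: the two keying paths of the new HMAC-MD5
interface. hmac_md5_preparekey() amortises the key schedule when one key
authenticates many messages; hmac_md5_usingrawkey() is the one-shot form.]

    #include <crypto/md5.h>

    static void demo_hmac_md5(const u8 *key, size_t key_len,
                              const u8 *msg, size_t msg_len,
                              u8 mac[MD5_DIGEST_SIZE])
    {
            struct hmac_md5_key k;

            /* Precompute the inner/outer states once, then reuse them. */
            hmac_md5_preparekey(&k, key, key_len);
            hmac_md5(&k, msg, msg_len, mac);

            /* Equivalent one-shot path straight from the raw key. */
            hmac_md5_usingrawkey(key, key_len, msg, msg_len, mac);
    }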
diff --git a/lib/crypto/memneq.c b/lib/crypto/memneq.c
index a2afd10349c9..44daacb8cb51 100644
--- a/lib/crypto/memneq.c
+++ b/lib/crypto/memneq.c
@@ -59,9 +59,10 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#include <linux/unaligned.h>
#include <crypto/algapi.h>
+#include <linux/export.h>
#include <linux/module.h>
+#include <linux/unaligned.h>
/* Generic path for arbitrary size */
static inline unsigned long
diff --git a/lib/crypto/mips/.gitignore b/lib/crypto/mips/.gitignore
new file mode 100644
index 000000000000..0d47d4f21c6d
--- /dev/null
+++ b/lib/crypto/mips/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+poly1305-core.S
diff --git a/lib/crypto/mips/chacha-core.S b/lib/crypto/mips/chacha-core.S
new file mode 100644
index 000000000000..706aeb850fb0
--- /dev/null
+++ b/lib/crypto/mips/chacha-core.S
@@ -0,0 +1,491 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#define MASK_U32 0x3c
+#define CHACHA20_BLOCK_SIZE 64
+#define STACK_SIZE 32
+
+#define X0 $t0
+#define X1 $t1
+#define X2 $t2
+#define X3 $t3
+#define X4 $t4
+#define X5 $t5
+#define X6 $t6
+#define X7 $t7
+#define X8 $t8
+#define X9 $t9
+#define X10 $v1
+#define X11 $s6
+#define X12 $s5
+#define X13 $s4
+#define X14 $s3
+#define X15 $s2
+/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
+#define T0 $s1
+#define T1 $s0
+#define T(n) T ## n
+#define X(n) X ## n
+
+/* Input arguments */
+#define STATE $a0
+#define OUT $a1
+#define IN $a2
+#define BYTES $a3
+
+/* Output argument */
+/* NONCE[0] is kept in a register and not in memory.
+ * We don't want to touch the original value in memory.
+ * It must be incremented on every loop iteration.
+ */
+#define NONCE_0 $v0
+
+/* SAVED_X and SAVED_CA are set in the jump table.
+ * Use regs which are overwritten on exit so we don't leak clear data.
+ * They are used to handle the last bytes, which are not a multiple of 4.
+ */
+#define SAVED_X X15
+#define SAVED_CA $s7
+
+#define IS_UNALIGNED $s7
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define MSB 0
+#define LSB 3
+#define CPU_TO_LE32(n) \
+ wsbh n, n; \
+ rotr n, 16;
+#else
+#define MSB 3
+#define LSB 0
+#define CPU_TO_LE32(n)
+#endif
+
+#define FOR_EACH_WORD(x) \
+ x( 0); \
+ x( 1); \
+ x( 2); \
+ x( 3); \
+ x( 4); \
+ x( 5); \
+ x( 6); \
+ x( 7); \
+ x( 8); \
+ x( 9); \
+ x(10); \
+ x(11); \
+ x(12); \
+ x(13); \
+ x(14); \
+ x(15);
+
+#define FOR_EACH_WORD_REV(x) \
+ x(15); \
+ x(14); \
+ x(13); \
+ x(12); \
+ x(11); \
+ x(10); \
+ x( 9); \
+ x( 8); \
+ x( 7); \
+ x( 6); \
+ x( 5); \
+ x( 4); \
+ x( 3); \
+ x( 2); \
+ x( 1); \
+ x( 0);
+
+#define PLUS_ONE_0 1
+#define PLUS_ONE_1 2
+#define PLUS_ONE_2 3
+#define PLUS_ONE_3 4
+#define PLUS_ONE_4 5
+#define PLUS_ONE_5 6
+#define PLUS_ONE_6 7
+#define PLUS_ONE_7 8
+#define PLUS_ONE_8 9
+#define PLUS_ONE_9 10
+#define PLUS_ONE_10 11
+#define PLUS_ONE_11 12
+#define PLUS_ONE_12 13
+#define PLUS_ONE_13 14
+#define PLUS_ONE_14 15
+#define PLUS_ONE_15 16
+#define PLUS_ONE(x) PLUS_ONE_ ## x
+#define _CONCAT3(a,b,c) a ## b ## c
+#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
+
+#define STORE_UNALIGNED(x) \
+CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
+ .if (x != 12); \
+ lw T0, (x*4)(STATE); \
+ .endif; \
+ lwl T1, (x*4)+MSB ## (IN); \
+ lwr T1, (x*4)+LSB ## (IN); \
+ .if (x == 12); \
+ addu X ## x, NONCE_0; \
+ .else; \
+ addu X ## x, T0; \
+ .endif; \
+ CPU_TO_LE32(X ## x); \
+ xor X ## x, T1; \
+ swl X ## x, (x*4)+MSB ## (OUT); \
+ swr X ## x, (x*4)+LSB ## (OUT);
+
+#define STORE_ALIGNED(x) \
+CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
+ .if (x != 12); \
+ lw T0, (x*4)(STATE); \
+ .endif; \
+ lw T1, (x*4) ## (IN); \
+ .if (x == 12); \
+ addu X ## x, NONCE_0; \
+ .else; \
+ addu X ## x, T0; \
+ .endif; \
+ CPU_TO_LE32(X ## x); \
+ xor X ## x, T1; \
+ sw X ## x, (x*4) ## (OUT);
+
+/* Jump table macro.
+ * Used for setup and for handling the last bytes, which are not a
+ * multiple of 4. X15 is free to store Xn.
+ * Every jump table entry must be the same size.
+ */
+#define JMPTBL_ALIGNED(x) \
+.Lchacha_mips_jmptbl_aligned_ ## x: ; \
+ .set noreorder; \
+ b .Lchacha_mips_xor_aligned_ ## x ## _b; \
+ .if (x == 12); \
+ addu SAVED_X, X ## x, NONCE_0; \
+ .else; \
+ addu SAVED_X, X ## x, SAVED_CA; \
+ .endif; \
+ .set reorder
+
+#define JMPTBL_UNALIGNED(x) \
+.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
+ .set noreorder; \
+ b .Lchacha_mips_xor_unaligned_ ## x ## _b; \
+ .if (x == 12); \
+ addu SAVED_X, X ## x, NONCE_0; \
+ .else; \
+ addu SAVED_X, X ## x, SAVED_CA; \
+ .endif; \
+ .set reorder
+
+#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
+ addu X(A), X(K); \
+ addu X(B), X(L); \
+ addu X(C), X(M); \
+ addu X(D), X(N); \
+ xor X(V), X(A); \
+ xor X(W), X(B); \
+ xor X(Y), X(C); \
+ xor X(Z), X(D); \
+ rotr X(V), 32 - S; \
+ rotr X(W), 32 - S; \
+ rotr X(Y), 32 - S; \
+ rotr X(Z), 32 - S;
+
+.text
+.set reorder
+.set noat
+.globl chacha_crypt_arch
+.ent chacha_crypt_arch
+chacha_crypt_arch:
+ .frame $sp, STACK_SIZE, $ra
+
+ /* Load number of rounds */
+ lw $at, 16($sp)
+
+ addiu $sp, -STACK_SIZE
+
+ /* Return if BYTES == 0. */
+ beqz BYTES, .Lchacha_mips_end
+
+ lw NONCE_0, 48(STATE)
+
+ /* Save s0-s7 */
+ sw $s0, 0($sp)
+ sw $s1, 4($sp)
+ sw $s2, 8($sp)
+ sw $s3, 12($sp)
+ sw $s4, 16($sp)
+ sw $s5, 20($sp)
+ sw $s6, 24($sp)
+ sw $s7, 28($sp)
+
+ /* Test whether IN or OUT is unaligned.
+ * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
+ */
+ or IS_UNALIGNED, IN, OUT
+ andi IS_UNALIGNED, 0x3
+
+ b .Lchacha_rounds_start
+
+.align 4
+.Loop_chacha_rounds:
+ addiu IN, CHACHA20_BLOCK_SIZE
+ addiu OUT, CHACHA20_BLOCK_SIZE
+ addiu NONCE_0, 1
+
+.Lchacha_rounds_start:
+ lw X0, 0(STATE)
+ lw X1, 4(STATE)
+ lw X2, 8(STATE)
+ lw X3, 12(STATE)
+
+ lw X4, 16(STATE)
+ lw X5, 20(STATE)
+ lw X6, 24(STATE)
+ lw X7, 28(STATE)
+ lw X8, 32(STATE)
+ lw X9, 36(STATE)
+ lw X10, 40(STATE)
+ lw X11, 44(STATE)
+
+ move X12, NONCE_0
+ lw X13, 52(STATE)
+ lw X14, 56(STATE)
+ lw X15, 60(STATE)
+
+.Loop_chacha_xor_rounds:
+ addiu $at, -2
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
+ bnez $at, .Loop_chacha_xor_rounds
+
+ addiu BYTES, -(CHACHA20_BLOCK_SIZE)
+
+ /* If the src/dst data is unaligned, take the unaligned path. */
+ bnez IS_UNALIGNED, .Loop_chacha_unaligned
+
+ /* Set the number of rounds here to fill the delay slot. */
+ lw $at, (STACK_SIZE+16)($sp)
+
+ /* BYTES < 0 means there is no full block left. */
+ bltz BYTES, .Lchacha_mips_no_full_block_aligned
+
+ FOR_EACH_WORD_REV(STORE_ALIGNED)
+
+ /* BYTES > 0? Loop again. */
+ bgtz BYTES, .Loop_chacha_rounds
+
+ /* Place this here to fill delay slot */
+ addiu NONCE_0, 1
+
+ /* BYTES < 0? Handle last bytes */
+ bltz BYTES, .Lchacha_mips_xor_bytes
+
+.Lchacha_mips_xor_done:
+ /* Restore used registers */
+ lw $s0, 0($sp)
+ lw $s1, 4($sp)
+ lw $s2, 8($sp)
+ lw $s3, 12($sp)
+ lw $s4, 16($sp)
+ lw $s5, 20($sp)
+ lw $s6, 24($sp)
+ lw $s7, 28($sp)
+
+ /* Write NONCE_0 back to the right location in the state */
+ sw NONCE_0, 48(STATE)
+
+.Lchacha_mips_end:
+ addiu $sp, STACK_SIZE
+ jr $ra
+
+.Lchacha_mips_no_full_block_aligned:
+ /* Restore the offset on BYTES */
+ addiu BYTES, CHACHA20_BLOCK_SIZE
+
+ /* Get number of full WORDS */
+ andi $at, BYTES, MASK_U32
+
+ /* Load upper half of jump table addr */
+ lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0)
+
+ /* Calculate the lower-half jump table offset: each entry is 8 bytes, so insert $at (words * 4) at bit 1 to get words * 8 */
+ ins T0, $at, 1, 6
+
+ /* Add offset to STATE */
+ addu T1, STATE, $at
+
+ /* Add lower half jump table addr */
+ addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0)
+
+ /* Read value from STATE */
+ lw SAVED_CA, 0(T1)
+
+ /* Store the remaining byte count as a negative value */
+ subu BYTES, $at, BYTES
+
+ jr T0
+
+ /* Jump table */
+ FOR_EACH_WORD(JMPTBL_ALIGNED)
+
+
+.Loop_chacha_unaligned:
+ /* Set the number of rounds here to fill the delay slot. */
+ lw $at, (STACK_SIZE+16)($sp)
+
+ /* BYTES < 0 means there is no full block left. */
+ bltz BYTES, .Lchacha_mips_no_full_block_unaligned
+
+ FOR_EACH_WORD_REV(STORE_UNALIGNED)
+
+ /* BYTES > 0? Loop again. */
+ bgtz BYTES, .Loop_chacha_rounds
+
+ /* Write NONCE_0 back to the right location in the state */
+ sw NONCE_0, 48(STATE)
+
+ .set noreorder
+ /* Fall through to byte handling */
+ bgez BYTES, .Lchacha_mips_xor_done
+.Lchacha_mips_xor_unaligned_0_b:
+.Lchacha_mips_xor_aligned_0_b:
+ /* Place this here to fill delay slot */
+ addiu NONCE_0, 1
+ .set reorder
+
+.Lchacha_mips_xor_bytes:
+ addu IN, $at
+ addu OUT, $at
+ /* First byte */
+ lbu T1, 0(IN)
+ addiu $at, BYTES, 1
+ xor T1, SAVED_X
+ sb T1, 0(OUT)
+ beqz $at, .Lchacha_mips_xor_done
+ /* Second byte */
+ lbu T1, 1(IN)
+ addiu $at, BYTES, 2
+ rotr SAVED_X, 8
+ xor T1, SAVED_X
+ sb T1, 1(OUT)
+ beqz $at, .Lchacha_mips_xor_done
+ /* Third byte */
+ lbu T1, 2(IN)
+ rotr SAVED_X, 8
+ xor T1, SAVED_X
+ sb T1, 2(OUT)
+ b .Lchacha_mips_xor_done
+
+.Lchacha_mips_no_full_block_unaligned:
+ /* Restore the offset on BYTES */
+ addiu BYTES, CHACHA20_BLOCK_SIZE
+
+ /* Get number of full WORDS */
+ andi $at, BYTES, MASK_U32
+
+ /* Load upper half of jump table addr */
+ lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)
+
+ /* Calculate the lower-half jump table offset: each entry is 8 bytes, so insert $at (words * 4) at bit 1 to get words * 8 */
+ ins T0, $at, 1, 6
+
+ /* Add offset to STATE */
+ addu T1, STATE, $at
+
+ /* Add lower half jump table addr */
+ addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)
+
+ /* Read value from STATE */
+ lw SAVED_CA, 0(T1)
+
+ /* Store the remaining byte count as a negative value */
+ subu BYTES, $at, BYTES
+
+ jr T0
+
+ /* Jump table */
+ FOR_EACH_WORD(JMPTBL_UNALIGNED)
+.end chacha_crypt_arch
+.set at
+
+/* Input arguments
+ * STATE $a0
+ * OUT $a1
+ * NROUND $a2
+ */
+
+#undef X12
+#undef X13
+#undef X14
+#undef X15
+
+#define X12 $a3
+#define X13 $at
+#define X14 $v0
+#define X15 STATE
+
+.set noat
+.globl hchacha_block_arch
+.ent hchacha_block_arch
+hchacha_block_arch:
+ .frame $sp, STACK_SIZE, $ra
+
+ addiu $sp, -STACK_SIZE
+
+ /* Save X11(s6) */
+ sw X11, 0($sp)
+
+ lw X0, 0(STATE)
+ lw X1, 4(STATE)
+ lw X2, 8(STATE)
+ lw X3, 12(STATE)
+ lw X4, 16(STATE)
+ lw X5, 20(STATE)
+ lw X6, 24(STATE)
+ lw X7, 28(STATE)
+ lw X8, 32(STATE)
+ lw X9, 36(STATE)
+ lw X10, 40(STATE)
+ lw X11, 44(STATE)
+ lw X12, 48(STATE)
+ lw X13, 52(STATE)
+ lw X14, 56(STATE)
+ lw X15, 60(STATE)
+
+.Loop_hchacha_xor_rounds:
+ addiu $a2, -2
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
+ bnez $a2, .Loop_hchacha_xor_rounds
+
+ /* Restore used register */
+ lw X11, 0($sp)
+
+ sw X0, 0(OUT)
+ sw X1, 4(OUT)
+ sw X2, 8(OUT)
+ sw X3, 12(OUT)
+ sw X12, 16(OUT)
+ sw X13, 20(OUT)
+ sw X14, 24(OUT)
+ sw X15, 28(OUT)
+
+ addiu $sp, STACK_SIZE
+ jr $ra
+.end hchacha_block_arch
+.set at
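The AXR macro above applies one add-xor-rotate step to four columns of the ChaCha state at once; rotr by 32 - S is the same operation as a left-rotate by S. A single lane, sketched in C (assuming the usual u32 x[16] state layout):

    /* One AXR lane: add, xor, rotate left by s. */
    static void axr_lane(u32 x[16], int a, int k, int v, int s)
    {
        x[a] += x[k];
        x[v] ^= x[a];
        x[v] = (x[v] << s) | (x[v] >> (32 - s));
    }

Four AXR invocations with shifts 16, 12, 8 and 7 form the column round, and the next four (with rotated operand indices) form the diagonal round, matching .Loop_chacha_xor_rounds above.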
diff --git a/lib/crypto/mips/chacha.h b/lib/crypto/mips/chacha.h
new file mode 100644
index 000000000000..0c18c0dc2a40
--- /dev/null
+++ b/lib/crypto/mips/chacha.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ChaCha and HChaCha functions (MIPS optimized)
+ *
+ * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/kernel.h>
+
+asmlinkage void chacha_crypt_arch(struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds);
+asmlinkage void hchacha_block_arch(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds);
diff --git a/lib/crypto/mips/md5.h b/lib/crypto/mips/md5.h
new file mode 100644
index 000000000000..e08e28aeffa4
--- /dev/null
+++ b/lib/crypto/mips/md5.h
@@ -0,0 +1,65 @@
+/*
+ * Cryptographic API.
+ *
+ * MD5 Message Digest Algorithm (RFC1321).
+ *
+ * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>.
+ *
+ * Based on crypto/md5.c, which is:
+ *
+ * Derived from cryptoapi implementation, originally based on the
+ * public domain implementation written by Colin Plumb in 1993.
+ *
+ * Copyright (c) Cryptoapi developers.
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <asm/octeon/crypto.h>
+#include <asm/octeon/octeon.h>
+
+/*
+ * We pass everything as 64-bit. OCTEON can handle misaligned data.
+ */
+
+static void md5_blocks(struct md5_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ struct octeon_cop2_state cop2_state;
+ u64 *state64 = (u64 *)state;
+ unsigned long flags;
+
+ if (!octeon_has_crypto())
+ return md5_blocks_generic(state, data, nblocks);
+
+ cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
+
+ flags = octeon_crypto_enable(&cop2_state);
+ write_octeon_64bit_hash_dword(state64[0], 0);
+ write_octeon_64bit_hash_dword(state64[1], 1);
+
+ do {
+ const u64 *block = (const u64 *)data;
+
+ write_octeon_64bit_block_dword(block[0], 0);
+ write_octeon_64bit_block_dword(block[1], 1);
+ write_octeon_64bit_block_dword(block[2], 2);
+ write_octeon_64bit_block_dword(block[3], 3);
+ write_octeon_64bit_block_dword(block[4], 4);
+ write_octeon_64bit_block_dword(block[5], 5);
+ write_octeon_64bit_block_dword(block[6], 6);
+ octeon_md5_start(block[7]);
+
+ data += MD5_BLOCK_SIZE;
+ } while (--nblocks);
+
+ state64[0] = read_octeon_64bit_hash_dword(0);
+ state64[1] = read_octeon_64bit_hash_dword(1);
+ octeon_crypto_disable(&cop2_state, flags);
+
+ le32_to_cpu_array(state->h, ARRAY_SIZE(state->h));
+}
diff --git a/lib/crypto/mips/poly1305-mips.pl b/lib/crypto/mips/poly1305-mips.pl
new file mode 100644
index 000000000000..71347f34f4f9
--- /dev/null
+++ b/lib/crypto/mips/poly1305-mips.pl
@@ -0,0 +1,1269 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
+# project.
+# ====================================================================
+
+# Poly1305 hash for MIPS.
+#
+# May 2016
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone.
+#
+# IALU/gcc
+# R1x000 ~5.5/+130% (big-endian)
+# Octeon II 2.50/+70% (little-endian)
+#
+# March 2019
+#
+# Add 32-bit code path.
+#
+# October 2019
+#
+# Modulo-scheduled reduction allows omitting the dependency chain at the
+# end of the inner loop, improving performance. Also optimize the MIPS32R2
+# code path for the MIPS 1004K core. Per René van Dorst's suggestions.
+#
+# IALU/gcc
+# R1x000 ~9.8/? (big-endian)
+# Octeon II 3.65/+140% (little-endian)
+# MT7621/1004K 4.75/? (little-endian)
+#
+######################################################################
+# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in an ABI-neutral
+# manner. Therefore let's stick to the NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. The following coding rules
+# facilitate interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp [o32 can be
+# excluded from the rule, because it's specified volatile];
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+# old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
+
+$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
+
+if ($flavour =~ /64|n32/i) {{{
+######################################################################
+# 64-bit code path
+#
+
+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
+my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
+
+$code.=<<___;
+#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
+ defined(_MIPS_ARCH_MIPS64R6)) \\
+ && !defined(_MIPS_ARCH_MIPS64R2)
+# define _MIPS_ARCH_MIPS64R2
+#endif
+
+#if defined(_MIPS_ARCH_MIPS64R6)
+# define dmultu(rs,rt)
+# define mflo(rd,rs,rt) dmulu rd,rs,rt
+# define mfhi(rd,rs,rt) dmuhu rd,rs,rt
+#else
+# define dmultu(rs,rt) dmultu rs,rt
+# define mflo(rd,rs,rt) mflo rd
+# define mfhi(rd,rs,rt) mfhi rd
+#endif
+
+#ifdef __KERNEL__
+# define poly1305_init poly1305_block_init
+#endif
+
+#if defined(__MIPSEB__) && !defined(MIPSEB)
+# define MIPSEB
+#endif
+
+#ifdef MIPSEB
+# define MSB 0
+# define LSB 7
+#else
+# define MSB 7
+# define LSB 0
+#endif
+
+.text
+.set noat
+.set noreorder
+
+.align 5
+.globl poly1305_init
+.ent poly1305_init
+poly1305_init:
+ .frame $sp,0,$ra
+ .set reorder
+
+ sd $zero,0($ctx)
+ sd $zero,8($ctx)
+ sd $zero,16($ctx)
+
+ beqz $inp,.Lno_key
+
+#if defined(_MIPS_ARCH_MIPS64R6)
+ andi $tmp0,$inp,7 # $inp % 8
+ dsubu $inp,$inp,$tmp0 # align $inp
+ sll $tmp0,$tmp0,3 # byte to bit offset
+ ld $in0,0($inp)
+ ld $in1,8($inp)
+ beqz $tmp0,.Laligned_key
+ ld $tmp2,16($inp)
+
+ subu $tmp1,$zero,$tmp0
+# ifdef MIPSEB
+ dsllv $in0,$in0,$tmp0
+ dsrlv $tmp3,$in1,$tmp1
+ dsllv $in1,$in1,$tmp0
+ dsrlv $tmp2,$tmp2,$tmp1
+# else
+ dsrlv $in0,$in0,$tmp0
+ dsllv $tmp3,$in1,$tmp1
+ dsrlv $in1,$in1,$tmp0
+ dsllv $tmp2,$tmp2,$tmp1
+# endif
+ or $in0,$in0,$tmp3
+ or $in1,$in1,$tmp2
+.Laligned_key:
+#else
+ ldl $in0,0+MSB($inp)
+ ldl $in1,8+MSB($inp)
+ ldr $in0,0+LSB($inp)
+ ldr $in1,8+LSB($inp)
+#endif
+#ifdef MIPSEB
+# if defined(_MIPS_ARCH_MIPS64R2)
+ dsbh $in0,$in0 # byte swap
+ dsbh $in1,$in1
+ dshd $in0,$in0
+ dshd $in1,$in1
+# else
+ ori $tmp0,$zero,0xFF
+ dsll $tmp2,$tmp0,32
+ or $tmp0,$tmp2 # 0x000000FF000000FF
+
+ and $tmp1,$in0,$tmp0 # byte swap
+ and $tmp3,$in1,$tmp0
+ dsrl $tmp2,$in0,24
+ dsrl $tmp4,$in1,24
+ dsll $tmp1,24
+ dsll $tmp3,24
+ and $tmp2,$tmp0
+ and $tmp4,$tmp0
+ dsll $tmp0,8 # 0x0000FF000000FF00
+ or $tmp1,$tmp2
+ or $tmp3,$tmp4
+ and $tmp2,$in0,$tmp0
+ and $tmp4,$in1,$tmp0
+ dsrl $in0,8
+ dsrl $in1,8
+ dsll $tmp2,8
+ dsll $tmp4,8
+ and $in0,$tmp0
+ and $in1,$tmp0
+ or $tmp1,$tmp2
+ or $tmp3,$tmp4
+ or $in0,$tmp1
+ or $in1,$tmp3
+ dsrl $tmp1,$in0,32
+ dsrl $tmp3,$in1,32
+ dsll $in0,32
+ dsll $in1,32
+ or $in0,$tmp1
+ or $in1,$tmp3
+# endif
+#endif
+ li $tmp0,1
+ dsll $tmp0,32 # 0x0000000100000000
+ daddiu $tmp0,-63 # 0x00000000ffffffc1
+ dsll $tmp0,28 # 0x0ffffffc10000000
+ daddiu $tmp0,-1 # 0x0ffffffc0fffffff
+
+ and $in0,$tmp0
+ daddiu $tmp0,-3 # 0x0ffffffc0ffffffc
+ and $in1,$tmp0
+
+ sd $in0,24($ctx)
+ dsrl $tmp0,$in1,2
+ sd $in1,32($ctx)
+ daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
+ sd $tmp0,40($ctx)
+
+.Lno_key:
+ li $v0,0 # return 0
+ jr $ra
+.end poly1305_init
+___
+{
+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
+
+my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
+ ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
+my ($shr,$shl) = ($s6,$s7); # used on R6
+
+$code.=<<___;
+.align 5
+.globl poly1305_blocks
+.ent poly1305_blocks
+poly1305_blocks:
+ .set noreorder
+ dsrl $len,4 # number of complete blocks
+ bnez $len,poly1305_blocks_internal
+ nop
+ jr $ra
+ nop
+.end poly1305_blocks
+
+.align 5
+.ent poly1305_blocks_internal
+poly1305_blocks_internal:
+ .set noreorder
+#if defined(_MIPS_ARCH_MIPS64R6)
+ .frame $sp,8*8,$ra
+ .mask $SAVED_REGS_MASK|0x000c0000,-8
+ dsubu $sp,8*8
+ sd $s7,56($sp)
+ sd $s6,48($sp)
+#else
+ .frame $sp,6*8,$ra
+ .mask $SAVED_REGS_MASK,-8
+ dsubu $sp,6*8
+#endif
+ sd $s5,40($sp)
+ sd $s4,32($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ sd $s3,24($sp)
+ sd $s2,16($sp)
+ sd $s1,8($sp)
+ sd $s0,0($sp)
+___
+$code.=<<___;
+ .set reorder
+
+#if defined(_MIPS_ARCH_MIPS64R6)
+ andi $shr,$inp,7
+ dsubu $inp,$inp,$shr # align $inp
+ sll $shr,$shr,3 # byte to bit offset
+ subu $shl,$zero,$shr
+#endif
+
+ ld $h0,0($ctx) # load hash value
+ ld $h1,8($ctx)
+ ld $h2,16($ctx)
+
+ ld $r0,24($ctx) # load key
+ ld $r1,32($ctx)
+ ld $rs1,40($ctx)
+
+ dsll $len,4
+ daddu $len,$inp # end of buffer
+ b .Loop
+
+.align 4
+.Loop:
+#if defined(_MIPS_ARCH_MIPS64R6)
+ ld $in0,0($inp) # load input
+ ld $in1,8($inp)
+ beqz $shr,.Laligned_inp
+
+ ld $tmp2,16($inp)
+# ifdef MIPSEB
+ dsllv $in0,$in0,$shr
+ dsrlv $tmp3,$in1,$shl
+ dsllv $in1,$in1,$shr
+ dsrlv $tmp2,$tmp2,$shl
+# else
+ dsrlv $in0,$in0,$shr
+ dsllv $tmp3,$in1,$shl
+ dsrlv $in1,$in1,$shr
+ dsllv $tmp2,$tmp2,$shl
+# endif
+ or $in0,$in0,$tmp3
+ or $in1,$in1,$tmp2
+.Laligned_inp:
+#else
+ ldl $in0,0+MSB($inp) # load input
+ ldl $in1,8+MSB($inp)
+ ldr $in0,0+LSB($inp)
+ ldr $in1,8+LSB($inp)
+#endif
+ daddiu $inp,16
+#ifdef MIPSEB
+# if defined(_MIPS_ARCH_MIPS64R2)
+ dsbh $in0,$in0 # byte swap
+ dsbh $in1,$in1
+ dshd $in0,$in0
+ dshd $in1,$in1
+# else
+ ori $tmp0,$zero,0xFF
+ dsll $tmp2,$tmp0,32
+ or $tmp0,$tmp2 # 0x000000FF000000FF
+
+ and $tmp1,$in0,$tmp0 # byte swap
+ and $tmp3,$in1,$tmp0
+ dsrl $tmp2,$in0,24
+ dsrl $tmp4,$in1,24
+ dsll $tmp1,24
+ dsll $tmp3,24
+ and $tmp2,$tmp0
+ and $tmp4,$tmp0
+ dsll $tmp0,8 # 0x0000FF000000FF00
+ or $tmp1,$tmp2
+ or $tmp3,$tmp4
+ and $tmp2,$in0,$tmp0
+ and $tmp4,$in1,$tmp0
+ dsrl $in0,8
+ dsrl $in1,8
+ dsll $tmp2,8
+ dsll $tmp4,8
+ and $in0,$tmp0
+ and $in1,$tmp0
+ or $tmp1,$tmp2
+ or $tmp3,$tmp4
+ or $in0,$tmp1
+ or $in1,$tmp3
+ dsrl $tmp1,$in0,32
+ dsrl $tmp3,$in1,32
+ dsll $in0,32
+ dsll $in1,32
+ or $in0,$tmp1
+ or $in1,$tmp3
+# endif
+#endif
+ dsrl $tmp1,$h2,2 # modulo-scheduled reduction
+ andi $h2,$h2,3
+ dsll $tmp0,$tmp1,2
+
+ daddu $d0,$h0,$in0 # accumulate input
+ daddu $tmp1,$tmp0
+ sltu $tmp0,$d0,$h0
+ daddu $d0,$d0,$tmp1 # ... and residue
+ sltu $tmp1,$d0,$tmp1
+ daddu $d1,$h1,$in1
+ daddu $tmp0,$tmp1
+ sltu $tmp1,$d1,$h1
+ daddu $d1,$tmp0
+
+ dmultu ($r0,$d0) # h0*r0
+ daddu $d2,$h2,$padbit
+ sltu $tmp0,$d1,$tmp0
+ mflo ($h0,$r0,$d0)
+ mfhi ($h1,$r0,$d0)
+
+ dmultu ($rs1,$d1) # h1*5*r1
+ daddu $d2,$tmp1
+ daddu $d2,$tmp0
+ mflo ($tmp0,$rs1,$d1)
+ mfhi ($tmp1,$rs1,$d1)
+
+ dmultu ($r1,$d0) # h0*r1
+ mflo ($tmp2,$r1,$d0)
+ mfhi ($h2,$r1,$d0)
+ daddu $h0,$tmp0
+ daddu $h1,$tmp1
+ sltu $tmp0,$h0,$tmp0
+
+ dmultu ($r0,$d1) # h1*r0
+ daddu $h1,$tmp0
+ daddu $h1,$tmp2
+ mflo ($tmp0,$r0,$d1)
+ mfhi ($tmp1,$r0,$d1)
+
+ dmultu ($rs1,$d2) # h2*5*r1
+ sltu $tmp2,$h1,$tmp2
+ daddu $h2,$tmp2
+ mflo ($tmp2,$rs1,$d2)
+
+ dmultu ($r0,$d2) # h2*r0
+ daddu $h1,$tmp0
+ daddu $h2,$tmp1
+ mflo ($tmp3,$r0,$d2)
+ sltu $tmp0,$h1,$tmp0
+ daddu $h2,$tmp0
+
+ daddu $h1,$tmp2
+ sltu $tmp2,$h1,$tmp2
+ daddu $h2,$tmp2
+ daddu $h2,$tmp3
+
+ bne $inp,$len,.Loop
+
+ sd $h0,0($ctx) # store hash value
+ sd $h1,8($ctx)
+ sd $h2,16($ctx)
+
+ .set noreorder
+#if defined(_MIPS_ARCH_MIPS64R6)
+ ld $s7,56($sp)
+ ld $s6,48($sp)
+#endif
+ ld $s5,40($sp) # epilogue
+ ld $s4,32($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
+ ld $s3,24($sp)
+ ld $s2,16($sp)
+ ld $s1,8($sp)
+ ld $s0,0($sp)
+___
+$code.=<<___;
+ jr $ra
+#if defined(_MIPS_ARCH_MIPS64R6)
+ daddu $sp,8*8
+#else
+ daddu $sp,6*8
+#endif
+.end poly1305_blocks_internal
+___
+}
+{
+my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
+
+$code.=<<___;
+.align 5
+.globl poly1305_emit
+.ent poly1305_emit
+poly1305_emit:
+ .frame $sp,0,$ra
+ .set reorder
+
+ ld $tmp2,16($ctx)
+ ld $tmp0,0($ctx)
+ ld $tmp1,8($ctx)
+
+ li $in0,-4 # final reduction
+ dsrl $in1,$tmp2,2
+ and $in0,$tmp2
+ andi $tmp2,$tmp2,3
+ daddu $in0,$in1
+
+ daddu $tmp0,$tmp0,$in0
+ sltu $in1,$tmp0,$in0
+ daddiu $in0,$tmp0,5 # compare to modulus
+ daddu $tmp1,$tmp1,$in1
+ sltiu $tmp3,$in0,5
+ sltu $tmp4,$tmp1,$in1
+ daddu $in1,$tmp1,$tmp3
+ daddu $tmp2,$tmp2,$tmp4
+ sltu $tmp3,$in1,$tmp3
+ daddu $tmp2,$tmp2,$tmp3
+
+ dsrl $tmp2,2 # see if it carried/borrowed
+ dsubu $tmp2,$zero,$tmp2
+
+ xor $in0,$tmp0
+ xor $in1,$tmp1
+ and $in0,$tmp2
+ and $in1,$tmp2
+ xor $in0,$tmp0
+ xor $in1,$tmp1
+
+ lwu $tmp0,0($nonce) # load nonce
+ lwu $tmp1,4($nonce)
+ lwu $tmp2,8($nonce)
+ lwu $tmp3,12($nonce)
+ dsll $tmp1,32
+ dsll $tmp3,32
+ or $tmp0,$tmp1
+ or $tmp2,$tmp3
+
+ daddu $in0,$tmp0 # accumulate nonce
+ daddu $in1,$tmp2
+ sltu $tmp0,$in0,$tmp0
+ daddu $in1,$tmp0
+
+ dsrl $tmp0,$in0,8 # write mac value
+ dsrl $tmp1,$in0,16
+ dsrl $tmp2,$in0,24
+ sb $in0,0($mac)
+ dsrl $tmp3,$in0,32
+ sb $tmp0,1($mac)
+ dsrl $tmp0,$in0,40
+ sb $tmp1,2($mac)
+ dsrl $tmp1,$in0,48
+ sb $tmp2,3($mac)
+ dsrl $tmp2,$in0,56
+ sb $tmp3,4($mac)
+ dsrl $tmp3,$in1,8
+ sb $tmp0,5($mac)
+ dsrl $tmp0,$in1,16
+ sb $tmp1,6($mac)
+ dsrl $tmp1,$in1,24
+ sb $tmp2,7($mac)
+
+ sb $in1,8($mac)
+ dsrl $tmp2,$in1,32
+ sb $tmp3,9($mac)
+ dsrl $tmp3,$in1,40
+ sb $tmp0,10($mac)
+ dsrl $tmp0,$in1,48
+ sb $tmp1,11($mac)
+ dsrl $tmp1,$in1,56
+ sb $tmp2,12($mac)
+ sb $tmp3,13($mac)
+ sb $tmp0,14($mac)
+ sb $tmp1,15($mac)
+
+ jr $ra
+.end poly1305_emit
+.rdata
+.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
+.align 2
+___
+}
+}}} else {{{
+######################################################################
+# 32-bit code path
+#
+
+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
+my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
+ ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
+
+$code.=<<___;
+#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
+ defined(_MIPS_ARCH_MIPS32R6)) \\
+ && !defined(_MIPS_ARCH_MIPS32R2)
+# define _MIPS_ARCH_MIPS32R2
+#endif
+
+#if defined(_MIPS_ARCH_MIPS32R6)
+# define multu(rs,rt)
+# define mflo(rd,rs,rt) mulu rd,rs,rt
+# define mfhi(rd,rs,rt) muhu rd,rs,rt
+#else
+# define multu(rs,rt) multu rs,rt
+# define mflo(rd,rs,rt) mflo rd
+# define mfhi(rd,rs,rt) mfhi rd
+#endif
+
+#ifdef __KERNEL__
+# define poly1305_init poly1305_block_init
+#endif
+
+#if defined(__MIPSEB__) && !defined(MIPSEB)
+# define MIPSEB
+#endif
+
+#ifdef MIPSEB
+# define MSB 0
+# define LSB 3
+#else
+# define MSB 3
+# define LSB 0
+#endif
+
+.text
+.set noat
+.set noreorder
+
+.align 5
+.globl poly1305_init
+.ent poly1305_init
+poly1305_init:
+ .frame $sp,0,$ra
+ .set reorder
+
+ sw $zero,0($ctx)
+ sw $zero,4($ctx)
+ sw $zero,8($ctx)
+ sw $zero,12($ctx)
+ sw $zero,16($ctx)
+
+ beqz $inp,.Lno_key
+
+#if defined(_MIPS_ARCH_MIPS32R6)
+ andi $tmp0,$inp,3 # $inp % 4
+ subu $inp,$inp,$tmp0 # align $inp
+ sll $tmp0,$tmp0,3 # byte to bit offset
+ lw $in0,0($inp)
+ lw $in1,4($inp)
+ lw $in2,8($inp)
+ lw $in3,12($inp)
+ beqz $tmp0,.Laligned_key
+
+ lw $tmp2,16($inp)
+ subu $tmp1,$zero,$tmp0
+# ifdef MIPSEB
+ sllv $in0,$in0,$tmp0
+ srlv $tmp3,$in1,$tmp1
+ sllv $in1,$in1,$tmp0
+ or $in0,$in0,$tmp3
+ srlv $tmp3,$in2,$tmp1
+ sllv $in2,$in2,$tmp0
+ or $in1,$in1,$tmp3
+ srlv $tmp3,$in3,$tmp1
+ sllv $in3,$in3,$tmp0
+ or $in2,$in2,$tmp3
+ srlv $tmp2,$tmp2,$tmp1
+ or $in3,$in3,$tmp2
+# else
+ srlv $in0,$in0,$tmp0
+ sllv $tmp3,$in1,$tmp1
+ srlv $in1,$in1,$tmp0
+ or $in0,$in0,$tmp3
+ sllv $tmp3,$in2,$tmp1
+ srlv $in2,$in2,$tmp0
+ or $in1,$in1,$tmp3
+ sllv $tmp3,$in3,$tmp1
+ srlv $in3,$in3,$tmp0
+ or $in2,$in2,$tmp3
+ sllv $tmp2,$tmp2,$tmp1
+ or $in3,$in3,$tmp2
+# endif
+.Laligned_key:
+#else
+ lwl $in0,0+MSB($inp)
+ lwl $in1,4+MSB($inp)
+ lwl $in2,8+MSB($inp)
+ lwl $in3,12+MSB($inp)
+ lwr $in0,0+LSB($inp)
+ lwr $in1,4+LSB($inp)
+ lwr $in2,8+LSB($inp)
+ lwr $in3,12+LSB($inp)
+#endif
+#ifdef MIPSEB
+# if defined(_MIPS_ARCH_MIPS32R2)
+ wsbh $in0,$in0 # byte swap
+ wsbh $in1,$in1
+ wsbh $in2,$in2
+ wsbh $in3,$in3
+ rotr $in0,$in0,16
+ rotr $in1,$in1,16
+ rotr $in2,$in2,16
+ rotr $in3,$in3,16
+# else
+ srl $tmp0,$in0,24 # byte swap
+ srl $tmp1,$in0,8
+ andi $tmp2,$in0,0xFF00
+ sll $in0,$in0,24
+ andi $tmp1,0xFF00
+ sll $tmp2,$tmp2,8
+ or $in0,$tmp0
+ srl $tmp0,$in1,24
+ or $tmp1,$tmp2
+ srl $tmp2,$in1,8
+ or $in0,$tmp1
+ andi $tmp1,$in1,0xFF00
+ sll $in1,$in1,24
+ andi $tmp2,0xFF00
+ sll $tmp1,$tmp1,8
+ or $in1,$tmp0
+ srl $tmp0,$in2,24
+ or $tmp2,$tmp1
+ srl $tmp1,$in2,8
+ or $in1,$tmp2
+ andi $tmp2,$in2,0xFF00
+ sll $in2,$in2,24
+ andi $tmp1,0xFF00
+ sll $tmp2,$tmp2,8
+ or $in2,$tmp0
+ srl $tmp0,$in3,24
+ or $tmp1,$tmp2
+ srl $tmp2,$in3,8
+ or $in2,$tmp1
+ andi $tmp1,$in3,0xFF00
+ sll $in3,$in3,24
+ andi $tmp2,0xFF00
+ sll $tmp1,$tmp1,8
+ or $in3,$tmp0
+ or $tmp2,$tmp1
+ or $in3,$tmp2
+# endif
+#endif
+ lui $tmp0,0x0fff
+ ori $tmp0,0xffff # 0x0fffffff
+ and $in0,$in0,$tmp0
+ subu $tmp0,3 # 0x0ffffffc
+ and $in1,$in1,$tmp0
+ and $in2,$in2,$tmp0
+ and $in3,$in3,$tmp0
+
+ sw $in0,20($ctx)
+ sw $in1,24($ctx)
+ sw $in2,28($ctx)
+ sw $in3,32($ctx)
+
+ srl $tmp1,$in1,2
+ srl $tmp2,$in2,2
+ srl $tmp3,$in3,2
+ addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
+ addu $in2,$in2,$tmp2
+ addu $in3,$in3,$tmp3
+ sw $in1,36($ctx)
+ sw $in2,40($ctx)
+ sw $in3,44($ctx)
+.Lno_key:
+ li $v0,0
+ jr $ra
+.end poly1305_init
+___
+{
+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";
+
+my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
+ ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
+my ($d0,$d1,$d2,$d3) =
+ ($a4,$a5,$a6,$a7);
+my $shr = $t2; # used on R6
+my $one = $t2; # used on R2
+
+$code.=<<___;
+.globl poly1305_blocks
+.align 5
+.ent poly1305_blocks
+poly1305_blocks:
+ .frame $sp,16*4,$ra
+ .mask $SAVED_REGS_MASK,-4
+ .set noreorder
+ subu $sp, $sp,4*12
+ sw $s11,4*11($sp)
+ sw $s10,4*10($sp)
+ sw $s9, 4*9($sp)
+ sw $s8, 4*8($sp)
+ sw $s7, 4*7($sp)
+ sw $s6, 4*6($sp)
+ sw $s5, 4*5($sp)
+ sw $s4, 4*4($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ sw $s3, 4*3($sp)
+ sw $s2, 4*2($sp)
+ sw $s1, 4*1($sp)
+ sw $s0, 4*0($sp)
+___
+$code.=<<___;
+ .set reorder
+
+ srl $len,4 # number of complete blocks
+ li $one,1
+ beqz $len,.Labort
+
+#if defined(_MIPS_ARCH_MIPS32R6)
+ andi $shr,$inp,3
+ subu $inp,$inp,$shr # align $inp
+ sll $shr,$shr,3 # byte to bit offset
+#endif
+
+ lw $h0,0($ctx) # load hash value
+ lw $h1,4($ctx)
+ lw $h2,8($ctx)
+ lw $h3,12($ctx)
+ lw $h4,16($ctx)
+
+ lw $r0,20($ctx) # load key
+ lw $r1,24($ctx)
+ lw $r2,28($ctx)
+ lw $r3,32($ctx)
+ lw $rs1,36($ctx)
+ lw $rs2,40($ctx)
+ lw $rs3,44($ctx)
+
+ sll $len,4
+ addu $len,$len,$inp # end of buffer
+ b .Loop
+
+.align 4
+.Loop:
+#if defined(_MIPS_ARCH_MIPS32R6)
+ lw $d0,0($inp) # load input
+ lw $d1,4($inp)
+ lw $d2,8($inp)
+ lw $d3,12($inp)
+ beqz $shr,.Laligned_inp
+
+ lw $t0,16($inp)
+ subu $t1,$zero,$shr
+# ifdef MIPSEB
+ sllv $d0,$d0,$shr
+ srlv $at,$d1,$t1
+ sllv $d1,$d1,$shr
+ or $d0,$d0,$at
+ srlv $at,$d2,$t1
+ sllv $d2,$d2,$shr
+ or $d1,$d1,$at
+ srlv $at,$d3,$t1
+ sllv $d3,$d3,$shr
+ or $d2,$d2,$at
+ srlv $t0,$t0,$t1
+ or $d3,$d3,$t0
+# else
+ srlv $d0,$d0,$shr
+ sllv $at,$d1,$t1
+ srlv $d1,$d1,$shr
+ or $d0,$d0,$at
+ sllv $at,$d2,$t1
+ srlv $d2,$d2,$shr
+ or $d1,$d1,$at
+ sllv $at,$d3,$t1
+ srlv $d3,$d3,$shr
+ or $d2,$d2,$at
+ sllv $t0,$t0,$t1
+ or $d3,$d3,$t0
+# endif
+.Laligned_inp:
+#else
+ lwl $d0,0+MSB($inp) # load input
+ lwl $d1,4+MSB($inp)
+ lwl $d2,8+MSB($inp)
+ lwl $d3,12+MSB($inp)
+ lwr $d0,0+LSB($inp)
+ lwr $d1,4+LSB($inp)
+ lwr $d2,8+LSB($inp)
+ lwr $d3,12+LSB($inp)
+#endif
+#ifdef MIPSEB
+# if defined(_MIPS_ARCH_MIPS32R2)
+ wsbh $d0,$d0 # byte swap
+ wsbh $d1,$d1
+ wsbh $d2,$d2
+ wsbh $d3,$d3
+ rotr $d0,$d0,16
+ rotr $d1,$d1,16
+ rotr $d2,$d2,16
+ rotr $d3,$d3,16
+# else
+ srl $at,$d0,24 # byte swap
+ srl $t0,$d0,8
+ andi $t1,$d0,0xFF00
+ sll $d0,$d0,24
+ andi $t0,0xFF00
+ sll $t1,$t1,8
+ or $d0,$at
+ srl $at,$d1,24
+ or $t0,$t1
+ srl $t1,$d1,8
+ or $d0,$t0
+ andi $t0,$d1,0xFF00
+ sll $d1,$d1,24
+ andi $t1,0xFF00
+ sll $t0,$t0,8
+ or $d1,$at
+ srl $at,$d2,24
+ or $t1,$t0
+ srl $t0,$d2,8
+ or $d1,$t1
+ andi $t1,$d2,0xFF00
+ sll $d2,$d2,24
+ andi $t0,0xFF00
+ sll $t1,$t1,8
+ or $d2,$at
+ srl $at,$d3,24
+ or $t0,$t1
+ srl $t1,$d3,8
+ or $d2,$t0
+ andi $t0,$d3,0xFF00
+ sll $d3,$d3,24
+ andi $t1,0xFF00
+ sll $t0,$t0,8
+ or $d3,$at
+ or $t1,$t0
+ or $d3,$t1
+# endif
+#endif
+ srl $t0,$h4,2 # modulo-scheduled reduction
+ andi $h4,$h4,3
+ sll $at,$t0,2
+
+ addu $d0,$d0,$h0 # accumulate input
+ addu $t0,$t0,$at
+ sltu $h0,$d0,$h0
+ addu $d0,$d0,$t0 # ... and residue
+ sltu $at,$d0,$t0
+
+ addu $d1,$d1,$h1
+ addu $h0,$h0,$at # carry
+ sltu $h1,$d1,$h1
+ addu $d1,$d1,$h0
+ sltu $h0,$d1,$h0
+
+ addu $d2,$d2,$h2
+ addu $h1,$h1,$h0 # carry
+ sltu $h2,$d2,$h2
+ addu $d2,$d2,$h1
+ sltu $h1,$d2,$h1
+
+ addu $d3,$d3,$h3
+ addu $h2,$h2,$h1 # carry
+ sltu $h3,$d3,$h3
+ addu $d3,$d3,$h2
+
+#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
+ multu $r0,$d0 # d0*r0
+ sltu $h2,$d3,$h2
+ maddu $rs3,$d1 # d1*s3
+ addu $h3,$h3,$h2 # carry
+ maddu $rs2,$d2 # d2*s2
+ addu $h4,$h4,$padbit
+ maddu $rs1,$d3 # d3*s1
+ addu $h4,$h4,$h3
+ mfhi $at
+ mflo $h0
+
+ multu $r1,$d0 # d0*r1
+ maddu $r0,$d1 # d1*r0
+ maddu $rs3,$d2 # d2*s3
+ maddu $rs2,$d3 # d3*s2
+ maddu $rs1,$h4 # h4*s1
+ maddu $at,$one # hi*1
+ mfhi $at
+ mflo $h1
+
+ multu $r2,$d0 # d0*r2
+ maddu $r1,$d1 # d1*r1
+ maddu $r0,$d2 # d2*r0
+ maddu $rs3,$d3 # d3*s3
+ maddu $rs2,$h4 # h4*s2
+ maddu $at,$one # hi*1
+ mfhi $at
+ mflo $h2
+
+ mul $t0,$r0,$h4 # h4*r0
+
+ multu $r3,$d0 # d0*r3
+ maddu $r2,$d1 # d1*r2
+ maddu $r1,$d2 # d2*r1
+ maddu $r0,$d3 # d3*r0
+ maddu $rs3,$h4 # h4*s3
+ maddu $at,$one # hi*1
+ mfhi $at
+ mflo $h3
+
+ addiu $inp,$inp,16
+
+ addu $h4,$t0,$at
+#else
+ multu ($r0,$d0) # d0*r0
+ mflo ($h0,$r0,$d0)
+ mfhi ($h1,$r0,$d0)
+
+ sltu $h2,$d3,$h2
+ addu $h3,$h3,$h2 # carry
+
+ multu ($rs3,$d1) # d1*s3
+ mflo ($at,$rs3,$d1)
+ mfhi ($t0,$rs3,$d1)
+
+ addu $h4,$h4,$padbit
+ addiu $inp,$inp,16
+ addu $h4,$h4,$h3
+
+ multu ($rs2,$d2) # d2*s2
+ mflo ($a3,$rs2,$d2)
+ mfhi ($t1,$rs2,$d2)
+ addu $h0,$h0,$at
+ addu $h1,$h1,$t0
+ multu ($rs1,$d3) # d3*s1
+ sltu $at,$h0,$at
+ addu $h1,$h1,$at
+
+ mflo ($at,$rs1,$d3)
+ mfhi ($t0,$rs1,$d3)
+ addu $h0,$h0,$a3
+ addu $h1,$h1,$t1
+ multu ($r1,$d0) # d0*r1
+ sltu $a3,$h0,$a3
+ addu $h1,$h1,$a3
+
+
+ mflo ($a3,$r1,$d0)
+ mfhi ($h2,$r1,$d0)
+ addu $h0,$h0,$at
+ addu $h1,$h1,$t0
+ multu ($r0,$d1) # d1*r0
+ sltu $at,$h0,$at
+ addu $h1,$h1,$at
+
+ mflo ($at,$r0,$d1)
+ mfhi ($t0,$r0,$d1)
+ addu $h1,$h1,$a3
+ sltu $a3,$h1,$a3
+ multu ($rs3,$d2) # d2*s3
+ addu $h2,$h2,$a3
+
+ mflo ($a3,$rs3,$d2)
+ mfhi ($t1,$rs3,$d2)
+ addu $h1,$h1,$at
+ addu $h2,$h2,$t0
+ multu ($rs2,$d3) # d3*s2
+ sltu $at,$h1,$at
+ addu $h2,$h2,$at
+
+ mflo ($at,$rs2,$d3)
+ mfhi ($t0,$rs2,$d3)
+ addu $h1,$h1,$a3
+ addu $h2,$h2,$t1
+ multu ($rs1,$h4) # h4*s1
+ sltu $a3,$h1,$a3
+ addu $h2,$h2,$a3
+
+ mflo ($a3,$rs1,$h4)
+ addu $h1,$h1,$at
+ addu $h2,$h2,$t0
+ multu ($r2,$d0) # d0*r2
+ sltu $at,$h1,$at
+ addu $h2,$h2,$at
+
+
+ mflo ($at,$r2,$d0)
+ mfhi ($h3,$r2,$d0)
+ addu $h1,$h1,$a3
+ sltu $a3,$h1,$a3
+ multu ($r1,$d1) # d1*r1
+ addu $h2,$h2,$a3
+
+ mflo ($a3,$r1,$d1)
+ mfhi ($t1,$r1,$d1)
+ addu $h2,$h2,$at
+ sltu $at,$h2,$at
+ multu ($r0,$d2) # d2*r0
+ addu $h3,$h3,$at
+
+ mflo ($at,$r0,$d2)
+ mfhi ($t0,$r0,$d2)
+ addu $h2,$h2,$a3
+ addu $h3,$h3,$t1
+ multu ($rs3,$d3) # d3*s3
+ sltu $a3,$h2,$a3
+ addu $h3,$h3,$a3
+
+ mflo ($a3,$rs3,$d3)
+ mfhi ($t1,$rs3,$d3)
+ addu $h2,$h2,$at
+ addu $h3,$h3,$t0
+ multu ($rs2,$h4) # h4*s2
+ sltu $at,$h2,$at
+ addu $h3,$h3,$at
+
+ mflo ($at,$rs2,$h4)
+ addu $h2,$h2,$a3
+ addu $h3,$h3,$t1
+ multu ($r3,$d0) # d0*r3
+ sltu $a3,$h2,$a3
+ addu $h3,$h3,$a3
+
+
+ mflo ($a3,$r3,$d0)
+ mfhi ($t1,$r3,$d0)
+ addu $h2,$h2,$at
+ sltu $at,$h2,$at
+ multu ($r2,$d1) # d1*r2
+ addu $h3,$h3,$at
+
+ mflo ($at,$r2,$d1)
+ mfhi ($t0,$r2,$d1)
+ addu $h3,$h3,$a3
+ sltu $a3,$h3,$a3
+ multu ($r0,$d3) # d3*r0
+ addu $t1,$t1,$a3
+
+ mflo ($a3,$r0,$d3)
+ mfhi ($d3,$r0,$d3)
+ addu $h3,$h3,$at
+ addu $t1,$t1,$t0
+ multu ($r1,$d2) # d2*r1
+ sltu $at,$h3,$at
+ addu $t1,$t1,$at
+
+ mflo ($at,$r1,$d2)
+ mfhi ($t0,$r1,$d2)
+ addu $h3,$h3,$a3
+ addu $t1,$t1,$d3
+ multu ($rs3,$h4) # h4*s3
+ sltu $a3,$h3,$a3
+ addu $t1,$t1,$a3
+
+ mflo ($a3,$rs3,$h4)
+ addu $h3,$h3,$at
+ addu $t1,$t1,$t0
+ multu ($r0,$h4) # h4*r0
+ sltu $at,$h3,$at
+ addu $t1,$t1,$at
+
+
+ mflo ($h4,$r0,$h4)
+ addu $h3,$h3,$a3
+ sltu $a3,$h3,$a3
+ addu $t1,$t1,$a3
+ addu $h4,$h4,$t1
+
+ li $padbit,1 # if we loop, padbit is 1
+#endif
+ bne $inp,$len,.Loop
+
+ sw $h0,0($ctx) # store hash value
+ sw $h1,4($ctx)
+ sw $h2,8($ctx)
+ sw $h3,12($ctx)
+ sw $h4,16($ctx)
+
+ .set noreorder
+.Labort:
+ lw $s11,4*11($sp)
+ lw $s10,4*10($sp)
+ lw $s9, 4*9($sp)
+ lw $s8, 4*8($sp)
+ lw $s7, 4*7($sp)
+ lw $s6, 4*6($sp)
+ lw $s5, 4*5($sp)
+ lw $s4, 4*4($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ lw $s3, 4*3($sp)
+ lw $s2, 4*2($sp)
+ lw $s1, 4*1($sp)
+ lw $s0, 4*0($sp)
+___
+$code.=<<___;
+ jr $ra
+ addu $sp,$sp,4*12
+.end poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
+
+$code.=<<___;
+.align 5
+.globl poly1305_emit
+.ent poly1305_emit
+poly1305_emit:
+ .frame $sp,0,$ra
+ .set reorder
+
+ lw $tmp4,16($ctx)
+ lw $tmp0,0($ctx)
+ lw $tmp1,4($ctx)
+ lw $tmp2,8($ctx)
+ lw $tmp3,12($ctx)
+
+ li $in0,-4 # final reduction
+ srl $ctx,$tmp4,2
+ and $in0,$in0,$tmp4
+ andi $tmp4,$tmp4,3
+ addu $ctx,$ctx,$in0
+
+ addu $tmp0,$tmp0,$ctx
+ sltu $ctx,$tmp0,$ctx
+ addiu $in0,$tmp0,5 # compare to modulus
+ addu $tmp1,$tmp1,$ctx
+ sltiu $in1,$in0,5
+ sltu $ctx,$tmp1,$ctx
+ addu $in1,$in1,$tmp1
+ addu $tmp2,$tmp2,$ctx
+ sltu $in2,$in1,$tmp1
+ sltu $ctx,$tmp2,$ctx
+ addu $in2,$in2,$tmp2
+ addu $tmp3,$tmp3,$ctx
+ sltu $in3,$in2,$tmp2
+ sltu $ctx,$tmp3,$ctx
+ addu $in3,$in3,$tmp3
+ addu $tmp4,$tmp4,$ctx
+ sltu $ctx,$in3,$tmp3
+ addu $ctx,$tmp4
+
+ srl $ctx,2 # see if it carried/borrowed
+ subu $ctx,$zero,$ctx
+
+ xor $in0,$tmp0
+ xor $in1,$tmp1
+ xor $in2,$tmp2
+ xor $in3,$tmp3
+ and $in0,$ctx
+ and $in1,$ctx
+ and $in2,$ctx
+ and $in3,$ctx
+ xor $in0,$tmp0
+ xor $in1,$tmp1
+ xor $in2,$tmp2
+ xor $in3,$tmp3
+
+ lw $tmp0,0($nonce) # load nonce
+ lw $tmp1,4($nonce)
+ lw $tmp2,8($nonce)
+ lw $tmp3,12($nonce)
+
+ addu $in0,$tmp0 # accumulate nonce
+ sltu $ctx,$in0,$tmp0
+
+ addu $in1,$tmp1
+ sltu $tmp1,$in1,$tmp1
+ addu $in1,$ctx
+ sltu $ctx,$in1,$ctx
+ addu $ctx,$tmp1
+
+ addu $in2,$tmp2
+ sltu $tmp2,$in2,$tmp2
+ addu $in2,$ctx
+ sltu $ctx,$in2,$ctx
+ addu $ctx,$tmp2
+
+ addu $in3,$tmp3
+ addu $in3,$ctx
+
+ srl $tmp0,$in0,8 # write mac value
+ srl $tmp1,$in0,16
+ srl $tmp2,$in0,24
+ sb $in0, 0($mac)
+ sb $tmp0,1($mac)
+ srl $tmp0,$in1,8
+ sb $tmp1,2($mac)
+ srl $tmp1,$in1,16
+ sb $tmp2,3($mac)
+ srl $tmp2,$in1,24
+ sb $in1, 4($mac)
+ sb $tmp0,5($mac)
+ srl $tmp0,$in2,8
+ sb $tmp1,6($mac)
+ srl $tmp1,$in2,16
+ sb $tmp2,7($mac)
+ srl $tmp2,$in2,24
+ sb $in2, 8($mac)
+ sb $tmp0,9($mac)
+ srl $tmp0,$in3,8
+ sb $tmp1,10($mac)
+ srl $tmp1,$in3,16
+ sb $tmp2,11($mac)
+ srl $tmp2,$in3,24
+ sb $in3, 12($mac)
+ sb $tmp0,13($mac)
+ sb $tmp1,14($mac)
+ sb $tmp2,15($mac)
+
+ jr $ra
+.end poly1305_emit
+.rdata
+.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
+.align 2
+___
+}
+}}}
+
+$output=pop and open STDOUT,">$output";
+print $code;
+close STDOUT;
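Two steps of the assembly above translate compactly to C. poly1305_init applies the masks 0x0ffffffc0fffffff and 0x0ffffffc0ffffffc to r, which is the RFC 7539 clamp, and poly1305_emit picks between h and h - p without branching. A sketch of both (helper names are illustrative):

    /* RFC 7539 clamp applied to the 16 little-endian key bytes holding r. */
    static void poly1305_clamp(u8 r[16])
    {
        r[3] &= 0x0f; r[7] &= 0x0f; r[11] &= 0x0f; r[15] &= 0x0f;
        r[4] &= 0xfc; r[8] &= 0xfc; r[12] &= 0xfc;
    }

    /* Constant-time select: mask all-ones picks a, all-zeros picks b. */
    static u64 ct_select(u64 mask, u64 a, u64 b)
    {
        return b ^ (mask & (a ^ b));
    }

In poly1305_emit the mask comes from whether h + 5 carries out of bit 130: if it does, then h >= 2^130 - 5 = p, so the reduced value h - p is emitted instead of h.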
diff --git a/lib/crypto/mips/poly1305.h b/lib/crypto/mips/poly1305.h
new file mode 100644
index 000000000000..85de450f1a93
--- /dev/null
+++ b/lib/crypto/mips/poly1305.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+asmlinkage void poly1305_block_init(struct poly1305_block_state *state,
+ const u8 raw_key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks(struct poly1305_block_state *state,
+ const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit(const struct poly1305_state *state,
+ u8 digest[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4]);
diff --git a/lib/crypto/mips/sha1.h b/lib/crypto/mips/sha1.h
new file mode 100644
index 000000000000..ba1965002e4a
--- /dev/null
+++ b/lib/crypto/mips/sha1.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Cryptographic API.
+ *
+ * SHA1 Secure Hash Algorithm.
+ *
+ * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>.
+ *
+ * Based on crypto/sha1_generic.c, which is:
+ *
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ */
+
+#include <asm/octeon/crypto.h>
+#include <asm/octeon/octeon.h>
+
+/*
+ * We pass everything as 64-bit. OCTEON can handle misaligned data.
+ */
+
+static void octeon_sha1_store_hash(struct sha1_block_state *state)
+{
+ u64 *hash = (u64 *)&state->h[0];
+ union {
+ u32 word[2];
+ u64 dword;
+ } hash_tail = { { state->h[4], } };
+
+ write_octeon_64bit_hash_dword(hash[0], 0);
+ write_octeon_64bit_hash_dword(hash[1], 1);
+ write_octeon_64bit_hash_dword(hash_tail.dword, 2);
+ memzero_explicit(&hash_tail.word[0], sizeof(hash_tail.word[0]));
+}
+
+static void octeon_sha1_read_hash(struct sha1_block_state *state)
+{
+ u64 *hash = (u64 *)&state->h[0];
+ union {
+ u32 word[2];
+ u64 dword;
+ } hash_tail;
+
+ hash[0] = read_octeon_64bit_hash_dword(0);
+ hash[1] = read_octeon_64bit_hash_dword(1);
+ hash_tail.dword = read_octeon_64bit_hash_dword(2);
+ state->h[4] = hash_tail.word[0];
+ memzero_explicit(&hash_tail.dword, sizeof(hash_tail.dword));
+}
+
+static void sha1_blocks(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ struct octeon_cop2_state cop2_state;
+ unsigned long flags;
+
+ if (!octeon_has_crypto())
+ return sha1_blocks_generic(state, data, nblocks);
+
+ flags = octeon_crypto_enable(&cop2_state);
+ octeon_sha1_store_hash(state);
+
+ do {
+ const u64 *block = (const u64 *)data;
+
+ write_octeon_64bit_block_dword(block[0], 0);
+ write_octeon_64bit_block_dword(block[1], 1);
+ write_octeon_64bit_block_dword(block[2], 2);
+ write_octeon_64bit_block_dword(block[3], 3);
+ write_octeon_64bit_block_dword(block[4], 4);
+ write_octeon_64bit_block_dword(block[5], 5);
+ write_octeon_64bit_block_dword(block[6], 6);
+ octeon_sha1_start(block[7]);
+
+ data += SHA1_BLOCK_SIZE;
+ } while (--nblocks);
+
+ octeon_sha1_read_hash(state);
+ octeon_crypto_disable(&cop2_state, flags);
+}
diff --git a/lib/crypto/mips/sha256.h b/lib/crypto/mips/sha256.h
new file mode 100644
index 000000000000..ccccfd131634
--- /dev/null
+++ b/lib/crypto/mips/sha256.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 Secure Hash Algorithm.
+ *
+ * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>.
+ *
+ * Based on crypto/sha256_generic.c, which is:
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
+ */
+
+#include <asm/octeon/crypto.h>
+#include <asm/octeon/octeon.h>
+
+/*
+ * We pass everything as 64-bit. OCTEON can handle misaligned data.
+ */
+
+static void sha256_blocks(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ struct octeon_cop2_state cop2_state;
+ u64 *state64 = (u64 *)state;
+ unsigned long flags;
+
+ if (!octeon_has_crypto())
+ return sha256_blocks_generic(state, data, nblocks);
+
+ flags = octeon_crypto_enable(&cop2_state);
+ write_octeon_64bit_hash_dword(state64[0], 0);
+ write_octeon_64bit_hash_dword(state64[1], 1);
+ write_octeon_64bit_hash_dword(state64[2], 2);
+ write_octeon_64bit_hash_dword(state64[3], 3);
+
+ do {
+ const u64 *block = (const u64 *)data;
+
+ write_octeon_64bit_block_dword(block[0], 0);
+ write_octeon_64bit_block_dword(block[1], 1);
+ write_octeon_64bit_block_dword(block[2], 2);
+ write_octeon_64bit_block_dword(block[3], 3);
+ write_octeon_64bit_block_dword(block[4], 4);
+ write_octeon_64bit_block_dword(block[5], 5);
+ write_octeon_64bit_block_dword(block[6], 6);
+ octeon_sha256_start(block[7]);
+
+ data += SHA256_BLOCK_SIZE;
+ } while (--nblocks);
+
+ state64[0] = read_octeon_64bit_hash_dword(0);
+ state64[1] = read_octeon_64bit_hash_dword(1);
+ state64[2] = read_octeon_64bit_hash_dword(2);
+ state64[3] = read_octeon_64bit_hash_dword(3);
+ octeon_crypto_disable(&cop2_state, flags);
+}
diff --git a/lib/crypto/mips/sha512.h b/lib/crypto/mips/sha512.h
new file mode 100644
index 000000000000..b3ffbc1e8ca8
--- /dev/null
+++ b/lib/crypto/mips/sha512.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Cryptographic API.
+ *
+ * SHA-512 and SHA-384 Secure Hash Algorithm.
+ *
+ * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>.
+ *
+ * Based on crypto/sha512_generic.c, which is:
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2003 Kyle McMartin <kyle@debian.org>
+ */
+
+#include <asm/octeon/crypto.h>
+#include <asm/octeon/octeon.h>
+
+/*
+ * We pass everything as 64-bit. OCTEON can handle misaligned data.
+ */
+
+static void sha512_blocks(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ struct octeon_cop2_state cop2_state;
+ unsigned long flags;
+
+ if (!octeon_has_crypto())
+ return sha512_blocks_generic(state, data, nblocks);
+
+ flags = octeon_crypto_enable(&cop2_state);
+ write_octeon_64bit_hash_sha512(state->h[0], 0);
+ write_octeon_64bit_hash_sha512(state->h[1], 1);
+ write_octeon_64bit_hash_sha512(state->h[2], 2);
+ write_octeon_64bit_hash_sha512(state->h[3], 3);
+ write_octeon_64bit_hash_sha512(state->h[4], 4);
+ write_octeon_64bit_hash_sha512(state->h[5], 5);
+ write_octeon_64bit_hash_sha512(state->h[6], 6);
+ write_octeon_64bit_hash_sha512(state->h[7], 7);
+
+ do {
+ const u64 *block = (const u64 *)data;
+
+ write_octeon_64bit_block_sha512(block[0], 0);
+ write_octeon_64bit_block_sha512(block[1], 1);
+ write_octeon_64bit_block_sha512(block[2], 2);
+ write_octeon_64bit_block_sha512(block[3], 3);
+ write_octeon_64bit_block_sha512(block[4], 4);
+ write_octeon_64bit_block_sha512(block[5], 5);
+ write_octeon_64bit_block_sha512(block[6], 6);
+ write_octeon_64bit_block_sha512(block[7], 7);
+ write_octeon_64bit_block_sha512(block[8], 8);
+ write_octeon_64bit_block_sha512(block[9], 9);
+ write_octeon_64bit_block_sha512(block[10], 10);
+ write_octeon_64bit_block_sha512(block[11], 11);
+ write_octeon_64bit_block_sha512(block[12], 12);
+ write_octeon_64bit_block_sha512(block[13], 13);
+ write_octeon_64bit_block_sha512(block[14], 14);
+ octeon_sha512_start(block[15]);
+
+ data += SHA512_BLOCK_SIZE;
+ } while (--nblocks);
+
+ state->h[0] = read_octeon_64bit_hash_sha512(0);
+ state->h[1] = read_octeon_64bit_hash_sha512(1);
+ state->h[2] = read_octeon_64bit_hash_sha512(2);
+ state->h[3] = read_octeon_64bit_hash_sha512(3);
+ state->h[4] = read_octeon_64bit_hash_sha512(4);
+ state->h[5] = read_octeon_64bit_hash_sha512(5);
+ state->h[6] = read_octeon_64bit_hash_sha512(6);
+ state->h[7] = read_octeon_64bit_hash_sha512(7);
+ octeon_crypto_disable(&cop2_state, flags);
+}
diff --git a/lib/crypto/mpi/mpi-add.c b/lib/crypto/mpi/mpi-add.c
index 3015140d4860..c0375c1672fa 100644
--- a/lib/crypto/mpi/mpi-add.c
+++ b/lib/crypto/mpi/mpi-add.c
@@ -11,6 +11,8 @@
* to avoid revealing of sensitive data due to paging etc.
*/
+#include <linux/export.h>
+
#include "mpi-internal.h"
int mpi_add(MPI w, MPI u, MPI v)
diff --git a/lib/crypto/mpi/mpi-bit.c b/lib/crypto/mpi/mpi-bit.c
index 934d81311360..b3d0e7ddbc03 100644
--- a/lib/crypto/mpi/mpi-bit.c
+++ b/lib/crypto/mpi/mpi-bit.c
@@ -18,6 +18,8 @@
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
*/
+#include <linux/export.h>
+
#include "mpi-internal.h"
#include "longlong.h"
diff --git a/lib/crypto/mpi/mpi-cmp.c b/lib/crypto/mpi/mpi-cmp.c
index ceaebe181cd7..b42929296bce 100644
--- a/lib/crypto/mpi/mpi-cmp.c
+++ b/lib/crypto/mpi/mpi-cmp.c
@@ -18,6 +18,8 @@
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
*/
+#include <linux/export.h>
+
#include "mpi-internal.h"
int mpi_cmp_ui(MPI u, unsigned long v)
diff --git a/lib/crypto/mpi/mpi-mul.c b/lib/crypto/mpi/mpi-mul.c
index 7e6ff1ce3e9b..d79f186ad90b 100644
--- a/lib/crypto/mpi/mpi-mul.c
+++ b/lib/crypto/mpi/mpi-mul.c
@@ -11,6 +11,8 @@
* to avoid revealing of sensitive data due to paging etc.
*/
+#include <linux/export.h>
+
#include "mpi-internal.h"
int mpi_mul(MPI w, MPI u, MPI v)
diff --git a/lib/crypto/mpi/mpi-pow.c b/lib/crypto/mpi/mpi-pow.c
index 67fbd4c2503d..9e695a3bda3a 100644
--- a/lib/crypto/mpi/mpi-pow.c
+++ b/lib/crypto/mpi/mpi-pow.c
@@ -13,8 +13,10 @@
* however I decided to publish this code under the plain GPL.
*/
+#include <linux/export.h>
#include <linux/sched.h>
#include <linux/string.h>
+
#include "mpi-internal.h"
#include "longlong.h"
diff --git a/lib/crypto/mpi/mpi-sub-ui.c b/lib/crypto/mpi/mpi-sub-ui.c
index b41b082b5f3e..0edcdbd24833 100644
--- a/lib/crypto/mpi/mpi-sub-ui.c
+++ b/lib/crypto/mpi/mpi-sub-ui.c
@@ -32,6 +32,8 @@
* see https://www.gnu.org/licenses/.
*/
+#include <linux/export.h>
+
#include "mpi-internal.h"
int mpi_sub_ui(MPI w, MPI u, unsigned long vval)
diff --git a/lib/crypto/mpi/mpicoder.c b/lib/crypto/mpi/mpicoder.c
index dde01030807d..47f6939599b3 100644
--- a/lib/crypto/mpi/mpicoder.c
+++ b/lib/crypto/mpi/mpicoder.c
@@ -19,8 +19,9 @@
*/
#include <linux/bitops.h>
-#include <linux/count_zeros.h>
#include <linux/byteorder/generic.h>
+#include <linux/count_zeros.h>
+#include <linux/export.h>
#include <linux/scatterlist.h>
#include <linux/string.h>
#include "mpi-internal.h"
diff --git a/lib/crypto/mpi/mpiutil.c b/lib/crypto/mpi/mpiutil.c
index 979ece5a81d2..7f2db830f404 100644
--- a/lib/crypto/mpi/mpiutil.c
+++ b/lib/crypto/mpi/mpiutil.c
@@ -18,6 +18,8 @@
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
*/
+#include <linux/export.h>
+
#include "mpi-internal.h"
/****************
diff --git a/lib/crypto/poly1305-donna32.c b/lib/crypto/poly1305-donna32.c
index 0a4a2d99e365..b66131b3f6d4 100644
--- a/lib/crypto/poly1305-donna32.c
+++ b/lib/crypto/poly1305-donna32.c
@@ -6,9 +6,10 @@
* public domain.
*/
+#include <crypto/internal/poly1305.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/unaligned.h>
-#include <crypto/internal/poly1305.h>
void poly1305_core_setkey(struct poly1305_core_key *key,
const u8 raw_key[POLY1305_BLOCK_SIZE])
diff --git a/lib/crypto/poly1305-donna64.c b/lib/crypto/poly1305-donna64.c
index 530287531b2e..8a72a5a84944 100644
--- a/lib/crypto/poly1305-donna64.c
+++ b/lib/crypto/poly1305-donna64.c
@@ -6,9 +6,10 @@
* public domain.
*/
+#include <crypto/internal/poly1305.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/unaligned.h>
-#include <crypto/internal/poly1305.h>
void poly1305_core_setkey(struct poly1305_core_key *key,
const u8 raw_key[POLY1305_BLOCK_SIZE])
diff --git a/lib/crypto/poly1305-generic.c b/lib/crypto/poly1305-generic.c
deleted file mode 100644
index a73f700fa1fb..000000000000
--- a/lib/crypto/poly1305-generic.c
+++ /dev/null
@@ -1,24 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Poly1305 authenticator algorithm, RFC7539
- *
- * Copyright (C) 2015 Martin Willi
- *
- * Based on public domain code by Andrew Moon and Daniel J. Bernstein.
- */
-
-#include <crypto/internal/poly1305.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-void poly1305_block_init_generic(struct poly1305_block_state *desc,
- const u8 raw_key[POLY1305_BLOCK_SIZE])
-{
- poly1305_core_init(&desc->h);
- poly1305_core_setkey(&desc->core_r, raw_key);
-}
-EXPORT_SYMBOL_GPL(poly1305_block_init_generic);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
-MODULE_DESCRIPTION("Poly1305 algorithm (generic implementation)");
diff --git a/lib/crypto/poly1305.c b/lib/crypto/poly1305.c
index 5f2f2af3b59f..f313ccc4b4dd 100644
--- a/lib/crypto/poly1305.c
+++ b/lib/crypto/poly1305.c
@@ -7,13 +7,21 @@
* Based on public domain code by Andrew Moon and Daniel J. Bernstein.
*/
-#include <crypto/internal/blockhash.h>
#include <crypto/internal/poly1305.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/unaligned.h>
+#ifdef CONFIG_CRYPTO_LIB_POLY1305_ARCH
+#include "poly1305.h" /* $(SRCARCH)/poly1305.h */
+#else
+#define poly1305_block_init poly1305_block_init_generic
+#define poly1305_blocks poly1305_blocks_generic
+#define poly1305_emit poly1305_emit_generic
+#endif
+
void poly1305_init(struct poly1305_desc_ctx *desc,
const u8 key[POLY1305_KEY_SIZE])
{
@@ -22,28 +30,40 @@ void poly1305_init(struct poly1305_desc_ctx *desc,
desc->s[2] = get_unaligned_le32(key + 24);
desc->s[3] = get_unaligned_le32(key + 28);
desc->buflen = 0;
- if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305))
- poly1305_block_init_arch(&desc->state, key);
- else
- poly1305_block_init_generic(&desc->state, key);
+ poly1305_block_init(&desc->state, key);
}
EXPORT_SYMBOL(poly1305_init);
-static inline void poly1305_blocks(struct poly1305_block_state *state,
- const u8 *src, unsigned int len)
-{
- if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305))
- poly1305_blocks_arch(state, src, len, 1);
- else
- poly1305_blocks_generic(state, src, len, 1);
-}
-
void poly1305_update(struct poly1305_desc_ctx *desc,
const u8 *src, unsigned int nbytes)
{
- desc->buflen = BLOCK_HASH_UPDATE(poly1305_blocks, &desc->state,
- src, nbytes, POLY1305_BLOCK_SIZE,
- desc->buf, desc->buflen);
+ if (desc->buflen + nbytes >= POLY1305_BLOCK_SIZE) {
+ unsigned int bulk_len;
+
+ if (desc->buflen) {
+ unsigned int l = POLY1305_BLOCK_SIZE - desc->buflen;
+
+ memcpy(&desc->buf[desc->buflen], src, l);
+ src += l;
+ nbytes -= l;
+
+ poly1305_blocks(&desc->state, desc->buf,
+ POLY1305_BLOCK_SIZE, 1);
+ desc->buflen = 0;
+ }
+
+ bulk_len = round_down(nbytes, POLY1305_BLOCK_SIZE);
+ nbytes %= POLY1305_BLOCK_SIZE;
+
+ if (bulk_len) {
+ poly1305_blocks(&desc->state, src, bulk_len, 1);
+ src += bulk_len;
+ }
+ }
+ if (nbytes) {
+ memcpy(&desc->buf[desc->buflen], src, nbytes);
+ desc->buflen += nbytes;
+ }
}
EXPORT_SYMBOL(poly1305_update);
@@ -53,22 +73,28 @@ void poly1305_final(struct poly1305_desc_ctx *desc, u8 *dst)
desc->buf[desc->buflen++] = 1;
memset(desc->buf + desc->buflen, 0,
POLY1305_BLOCK_SIZE - desc->buflen);
- if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305))
- poly1305_blocks_arch(&desc->state, desc->buf,
- POLY1305_BLOCK_SIZE, 0);
- else
- poly1305_blocks_generic(&desc->state, desc->buf,
- POLY1305_BLOCK_SIZE, 0);
+ poly1305_blocks(&desc->state, desc->buf, POLY1305_BLOCK_SIZE,
+ 0);
}
- if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305))
- poly1305_emit_arch(&desc->state.h, dst, desc->s);
- else
- poly1305_emit_generic(&desc->state.h, dst, desc->s);
+ poly1305_emit(&desc->state.h, dst, desc->s);
*desc = (struct poly1305_desc_ctx){};
}
EXPORT_SYMBOL(poly1305_final);
+#ifdef poly1305_mod_init_arch
+static int __init poly1305_mod_init(void)
+{
+ poly1305_mod_init_arch();
+ return 0;
+}
+subsys_initcall(poly1305_mod_init);
+
+static void __exit poly1305_mod_exit(void)
+{
+}
+module_exit(poly1305_mod_exit);
+#endif
+
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_DESCRIPTION("Poly1305 authenticator algorithm, RFC7539");
diff --git a/lib/crypto/polyval.c b/lib/crypto/polyval.c
new file mode 100644
index 000000000000..5796275f574a
--- /dev/null
+++ b/lib/crypto/polyval.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * POLYVAL library functions
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include <crypto/polyval.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+/*
+ * POLYVAL is an almost-XOR-universal hash function. Similar to GHASH, POLYVAL
+ * interprets the message as the coefficients of a polynomial in GF(2^128) and
+ * evaluates that polynomial at a secret point. POLYVAL has a simple
+ * mathematical relationship with GHASH, but it uses a better field convention
+ * which makes it easier and faster to implement.
+ *
+ * POLYVAL is not a cryptographic hash function, and it should be used only by
+ * algorithms that are specifically designed to use it.
+ *
+ * POLYVAL is specified by "AES-GCM-SIV: Nonce Misuse-Resistant Authenticated
+ * Encryption" (https://datatracker.ietf.org/doc/html/rfc8452)
+ *
+ * POLYVAL is also used by HCTR2. See "Length-preserving encryption with HCTR2"
+ * (https://eprint.iacr.org/2021/1441.pdf).
+ *
+ * This file provides a library API for POLYVAL. This API can delegate to
+ * either a generic implementation or an architecture-optimized implementation.
+ *
+ * For the generic implementation, we don't use the traditional table approach
+ * to GF(2^128) multiplication. That approach is not constant-time and requires
+ * a lot of memory. Instead, we use a different approach which emulates
+ * carryless multiplication using standard multiplications by spreading the data
+ * bits apart using "holes". This allows the carries to spill harmlessly. This
+ * approach is borrowed from BoringSSL, which in turn credits BearSSL's
+ * documentation (https://bearssl.org/constanttime.html#ghash-for-gcm) for the
+ * "holes" trick and a presentation by Shay Gueron
+ * (https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf) for the
+ * 256-bit => 128-bit reduction algorithm.
+ */
+
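+/*
+ * A quick worked instance of carryless multiplication (illustration only):
+ * 0b11 * 0b11 = 0b101, since (x + 1)^2 = x^2 + 1 over GF(2), whereas the
+ * ordinary integer product is 9 = 0b1001 because of the carry.
+ */
+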
+#ifdef CONFIG_ARCH_SUPPORTS_INT128
+
+/* Do a 64 x 64 => 128 bit carryless multiplication. */
+static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
+{
+ /*
+ * With 64-bit multiplicands and one term every 4 bits, there would be
+ * up to 64 / 4 = 16 one bits per column when each multiplication is
+ * written out as a series of additions in the schoolbook manner.
+ * Unfortunately, that doesn't work since the value 16 is 1 too large to
+ * fit in 4 bits. Carries would sometimes overflow into the next term.
+ *
+ * Using one term every 5 bits would work. However, that would cost
+ * 5 x 5 = 25 multiplications instead of 4 x 4 = 16.
+ *
+ * Instead, mask off 4 bits from one multiplicand, giving a max of 15
+ * one bits per column. Then handle those 4 bits separately.
+ */
+ u64 a0 = a & 0x1111111111111110;
+ u64 a1 = a & 0x2222222222222220;
+ u64 a2 = a & 0x4444444444444440;
+ u64 a3 = a & 0x8888888888888880;
+
+ u64 b0 = b & 0x1111111111111111;
+ u64 b1 = b & 0x2222222222222222;
+ u64 b2 = b & 0x4444444444444444;
+ u64 b3 = b & 0x8888888888888888;
+
+ /* Multiply the high 60 bits of @a by @b. */
+ u128 c0 = (a0 * (u128)b0) ^ (a1 * (u128)b3) ^
+ (a2 * (u128)b2) ^ (a3 * (u128)b1);
+ u128 c1 = (a0 * (u128)b1) ^ (a1 * (u128)b0) ^
+ (a2 * (u128)b3) ^ (a3 * (u128)b2);
+ u128 c2 = (a0 * (u128)b2) ^ (a1 * (u128)b1) ^
+ (a2 * (u128)b0) ^ (a3 * (u128)b3);
+ u128 c3 = (a0 * (u128)b3) ^ (a1 * (u128)b2) ^
+ (a2 * (u128)b1) ^ (a3 * (u128)b0);
+
+ /* Multiply the low 4 bits of @a by @b. */
+ u64 e0 = -(a & 1) & b;
+ u64 e1 = -((a >> 1) & 1) & b;
+ u64 e2 = -((a >> 2) & 1) & b;
+ u64 e3 = -((a >> 3) & 1) & b;
+ u64 extra_lo = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3);
+ u64 extra_hi = (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61);
+
+ /* Add all the intermediate products together. */
+ *out_lo = (((u64)c0) & 0x1111111111111111) ^
+ (((u64)c1) & 0x2222222222222222) ^
+ (((u64)c2) & 0x4444444444444444) ^
+ (((u64)c3) & 0x8888888888888888) ^ extra_lo;
+ *out_hi = (((u64)(c0 >> 64)) & 0x1111111111111111) ^
+ (((u64)(c1 >> 64)) & 0x2222222222222222) ^
+ (((u64)(c2 >> 64)) & 0x4444444444444444) ^
+ (((u64)(c3 >> 64)) & 0x8888888888888888) ^ extra_hi;
+}
+
+#else /* CONFIG_ARCH_SUPPORTS_INT128 */
+
+/* Do a 32 x 32 => 64 bit carryless multiplication. */
+static u64 clmul32(u32 a, u32 b)
+{
+ /*
+ * With 32-bit multiplicands and one term every 4 bits, there are up to
+ * 32 / 4 = 8 one bits per column when each multiplication is written
+ * out as a series of additions in the schoolbook manner. The value 8
+ * fits in 4 bits, so the carries don't overflow into the next term.
+ */
+ u32 a0 = a & 0x11111111;
+ u32 a1 = a & 0x22222222;
+ u32 a2 = a & 0x44444444;
+ u32 a3 = a & 0x88888888;
+
+ u32 b0 = b & 0x11111111;
+ u32 b1 = b & 0x22222222;
+ u32 b2 = b & 0x44444444;
+ u32 b3 = b & 0x88888888;
+
+ u64 c0 = (a0 * (u64)b0) ^ (a1 * (u64)b3) ^
+ (a2 * (u64)b2) ^ (a3 * (u64)b1);
+ u64 c1 = (a0 * (u64)b1) ^ (a1 * (u64)b0) ^
+ (a2 * (u64)b3) ^ (a3 * (u64)b2);
+ u64 c2 = (a0 * (u64)b2) ^ (a1 * (u64)b1) ^
+ (a2 * (u64)b0) ^ (a3 * (u64)b3);
+ u64 c3 = (a0 * (u64)b3) ^ (a1 * (u64)b2) ^
+ (a2 * (u64)b1) ^ (a3 * (u64)b0);
+
+ /* Add all the intermediate products together. */
+ return (c0 & 0x1111111111111111) ^
+ (c1 & 0x2222222222222222) ^
+ (c2 & 0x4444444444444444) ^
+ (c3 & 0x8888888888888888);
+}
+
+/* Do a 64 x 64 => 128 bit carryless multiplication. */
+static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
+{
+ u32 a_lo = (u32)a;
+ u32 a_hi = a >> 32;
+ u32 b_lo = (u32)b;
+ u32 b_hi = b >> 32;
+
+ /* Karatsuba multiplication */
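+ /*
+ * In GF(2), a*b = lo + mi*x^32 + hi*x^64 with
+ * mi = (a_lo ^ a_hi) * (b_lo ^ b_hi) ^ lo ^ hi, where addition is XOR;
+ * three 32x32 carryless multiplies thus replace the schoolbook four.
+ */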
+ u64 lo = clmul32(a_lo, b_lo);
+ u64 hi = clmul32(a_hi, b_hi);
+ u64 mi = clmul32(a_lo ^ a_hi, b_lo ^ b_hi) ^ lo ^ hi;
+
+ *out_lo = lo ^ (mi << 32);
+ *out_hi = hi ^ (mi >> 32);
+}
+#endif /* !CONFIG_ARCH_SUPPORTS_INT128 */
+
+/* Compute @a = @a * @b * x^-128 in the POLYVAL field. */
+static void __maybe_unused
+polyval_mul_generic(struct polyval_elem *a, const struct polyval_elem *b)
+{
+ u64 c0, c1, c2, c3, mi0, mi1;
+
+ /*
+ * Carryless-multiply @a by @b using Karatsuba multiplication. Store
+ * the 256-bit product in @c0 (low) through @c3 (high).
+ */
+ clmul64(le64_to_cpu(a->lo), le64_to_cpu(b->lo), &c0, &c1);
+ clmul64(le64_to_cpu(a->hi), le64_to_cpu(b->hi), &c2, &c3);
+ clmul64(le64_to_cpu(a->lo ^ a->hi), le64_to_cpu(b->lo ^ b->hi),
+ &mi0, &mi1);
+ mi0 ^= c0 ^ c2;
+ mi1 ^= c1 ^ c3;
+ c1 ^= mi0;
+ c2 ^= mi1;
+
+ /*
+ * Cancel out the low 128 bits of the product by adding multiples of
+ * G(x) = x^128 + x^127 + x^126 + x^121 + 1. Do this in two steps, each
+ * of which cancels out 64 bits. Note that we break G(x) into three
+ * parts: 1, x^64 * (x^63 + x^62 + x^57), and x^128 * 1.
+ */
+
+ /*
+ * First, add G(x) times c0 as follows:
+ *
+ * (c0, c1, c2) = (0,
+ * c1 + (c0 * (x^63 + x^62 + x^57) mod x^64),
+ * c2 + c0 + floor((c0 * (x^63 + x^62 + x^57)) / x^64))
+ */
+ c1 ^= (c0 << 63) ^ (c0 << 62) ^ (c0 << 57);
+ c2 ^= c0 ^ (c0 >> 1) ^ (c0 >> 2) ^ (c0 >> 7);
+
+ /*
+ * Second, add G(x) times the new c1:
+ *
+ * (c1, c2, c3) = (0,
+ * c2 + (c1 * (x^63 + x^62 + x^57) mod x^64),
+ * c3 + c1 + floor((c1 * (x^63 + x^62 + x^57)) / x^64))
+ */
+ c2 ^= (c1 << 63) ^ (c1 << 62) ^ (c1 << 57);
+ c3 ^= c1 ^ (c1 >> 1) ^ (c1 >> 2) ^ (c1 >> 7);
+
+ /* Return (c2, c3). This implicitly multiplies by x^-128. */
+ a->lo = cpu_to_le64(c2);
+ a->hi = cpu_to_le64(c3);
+}
+
+static void __maybe_unused
+polyval_blocks_generic(struct polyval_elem *acc, const struct polyval_elem *key,
+ const u8 *data, size_t nblocks)
+{
+ do {
+ acc->lo ^= get_unaligned((__le64 *)data);
+ acc->hi ^= get_unaligned((__le64 *)(data + 8));
+ polyval_mul_generic(acc, key);
+ data += POLYVAL_BLOCK_SIZE;
+ } while (--nblocks);
+}
+
+/* Include the arch-optimized implementation of POLYVAL, if one is available. */
+#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
+#include "polyval.h" /* $(SRCARCH)/polyval.h */
+void polyval_preparekey(struct polyval_key *key,
+ const u8 raw_key[POLYVAL_BLOCK_SIZE])
+{
+ polyval_preparekey_arch(key, raw_key);
+}
+EXPORT_SYMBOL_GPL(polyval_preparekey);
+#endif /* Else, polyval_preparekey() is an inline function. */
+
+/*
+ * polyval_mul_generic() and polyval_blocks_generic() take the key as a
+ * polyval_elem rather than a polyval_key, so that arch-optimized
+ * implementations with a different key format can use it as a fallback (if they
+ * have H^1 stored somewhere in their struct). Thus, the following dispatch
+ * code is needed to pass the appropriate key argument.
+ */
+
+static void polyval_mul(struct polyval_ctx *ctx)
+{
+#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
+ polyval_mul_arch(&ctx->acc, ctx->key);
+#else
+ polyval_mul_generic(&ctx->acc, &ctx->key->h);
+#endif
+}
+
+static void polyval_blocks(struct polyval_ctx *ctx,
+ const u8 *data, size_t nblocks)
+{
+#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
+ polyval_blocks_arch(&ctx->acc, ctx->key, data, nblocks);
+#else
+ polyval_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks);
+#endif
+}
+
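+/*
+ * Partial blocks are buffered in @ctx->acc itself: incoming bytes are XORed
+ * into the accumulator, and the multiplication by the key is deferred until
+ * a full 16-byte block has been absorbed.
+ */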
+void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len)
+{
+ if (unlikely(ctx->partial)) {
+ size_t n = min(len, POLYVAL_BLOCK_SIZE - ctx->partial);
+
+ len -= n;
+ while (n--)
+ ctx->acc.bytes[ctx->partial++] ^= *data++;
+ if (ctx->partial < POLYVAL_BLOCK_SIZE)
+ return;
+ polyval_mul(ctx);
+ }
+ if (len >= POLYVAL_BLOCK_SIZE) {
+ size_t nblocks = len / POLYVAL_BLOCK_SIZE;
+
+ polyval_blocks(ctx, data, nblocks);
+ data += len & ~(POLYVAL_BLOCK_SIZE - 1);
+ len &= POLYVAL_BLOCK_SIZE - 1;
+ }
+ for (size_t i = 0; i < len; i++)
+ ctx->acc.bytes[i] ^= data[i];
+ ctx->partial = len;
+}
+EXPORT_SYMBOL_GPL(polyval_update);
+
+void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE])
+{
+ if (unlikely(ctx->partial))
+ polyval_mul(ctx);
+ memcpy(out, &ctx->acc, POLYVAL_BLOCK_SIZE);
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(polyval_final);
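+
+/*
+ * Typical use of this API (an illustrative sketch; polyval_init() is assumed
+ * to be declared in <crypto/polyval.h> and its exact form may differ):
+ *
+ *	struct polyval_key key;
+ *	struct polyval_ctx ctx;
+ *	u8 mac[POLYVAL_BLOCK_SIZE];
+ *
+ *	polyval_preparekey(&key, raw_key);
+ *	polyval_init(&ctx, &key);
+ *	polyval_update(&ctx, data, len);
+ *	polyval_final(&ctx, mac);
+ */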
+
+#ifdef polyval_mod_init_arch
+static int __init polyval_mod_init(void)
+{
+ polyval_mod_init_arch();
+ return 0;
+}
+subsys_initcall(polyval_mod_init);
+
+static void __exit polyval_mod_exit(void)
+{
+}
+module_exit(polyval_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("POLYVAL almost-XOR-universal hash function");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/powerpc/chacha-p10le-8x.S b/lib/crypto/powerpc/chacha-p10le-8x.S
new file mode 100644
index 000000000000..b29562bd5d40
--- /dev/null
+++ b/lib/crypto/powerpc/chacha-p10le-8x.S
@@ -0,0 +1,840 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# Accelerated chacha20 implementation for ppc64le.
+#
+# Copyright 2023- IBM Corp. All rights reserved
+#
+#===================================================================================
+# Written by Danny Tsen <dtsen@us.ibm.com>
+#
+# do rounds, 8 quarter rounds
+# 1. a += b; d ^= a; d <<<= 16;
+# 2. c += d; b ^= c; b <<<= 12;
+# 3. a += b; d ^= a; d <<<= 8;
+# 4. c += d; b ^= c; b <<<= 7
+#
+# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16
+# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12
+# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8
+# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7
+#
+# 4 blocks (a b c d)
+#
+# a0 b0 c0 d0
+# a1 b1 c1 d1
+# ...
+# a4 b4 c4 d4
+# ...
+# a8 b8 c8 d8
+# ...
+# a12 b12 c12 d12
+# a13 ...
+# a14 ...
+# a15 b15 c15 d15
+#
+# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+# Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
+#
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+#include <linux/linkage.h>
+
+.machine "any"
+.text
+
+.macro SAVE_GPR GPR OFFSET FRAME
+ std \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro SAVE_VRS VRS OFFSET FRAME
+ li 16, \OFFSET
+ stvx \VRS, 16, \FRAME
+.endm
+
+.macro SAVE_VSX VSX OFFSET FRAME
+ li 16, \OFFSET
+ stxvx \VSX, 16, \FRAME
+.endm
+
+.macro RESTORE_GPR GPR OFFSET FRAME
+ ld \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro RESTORE_VRS VRS OFFSET FRAME
+ li 16, \OFFSET
+ lvx \VRS, 16, \FRAME
+.endm
+
+.macro RESTORE_VSX VSX OFFSET FRAME
+ li 16, \OFFSET
+ lxvx \VSX, 16, \FRAME
+.endm
+
+.macro SAVE_REGS
+ mflr 0
+ std 0, 16(1)
+ stdu 1,-752(1)
+
+ SAVE_GPR 14, 112, 1
+ SAVE_GPR 15, 120, 1
+ SAVE_GPR 16, 128, 1
+ SAVE_GPR 17, 136, 1
+ SAVE_GPR 18, 144, 1
+ SAVE_GPR 19, 152, 1
+ SAVE_GPR 20, 160, 1
+ SAVE_GPR 21, 168, 1
+ SAVE_GPR 22, 176, 1
+ SAVE_GPR 23, 184, 1
+ SAVE_GPR 24, 192, 1
+ SAVE_GPR 25, 200, 1
+ SAVE_GPR 26, 208, 1
+ SAVE_GPR 27, 216, 1
+ SAVE_GPR 28, 224, 1
+ SAVE_GPR 29, 232, 1
+ SAVE_GPR 30, 240, 1
+ SAVE_GPR 31, 248, 1
+
+ addi 9, 1, 256
+ SAVE_VRS 20, 0, 9
+ SAVE_VRS 21, 16, 9
+ SAVE_VRS 22, 32, 9
+ SAVE_VRS 23, 48, 9
+ SAVE_VRS 24, 64, 9
+ SAVE_VRS 25, 80, 9
+ SAVE_VRS 26, 96, 9
+ SAVE_VRS 27, 112, 9
+ SAVE_VRS 28, 128, 9
+ SAVE_VRS 29, 144, 9
+ SAVE_VRS 30, 160, 9
+ SAVE_VRS 31, 176, 9
+
+ SAVE_VSX 14, 192, 9
+ SAVE_VSX 15, 208, 9
+ SAVE_VSX 16, 224, 9
+ SAVE_VSX 17, 240, 9
+ SAVE_VSX 18, 256, 9
+ SAVE_VSX 19, 272, 9
+ SAVE_VSX 20, 288, 9
+ SAVE_VSX 21, 304, 9
+ SAVE_VSX 22, 320, 9
+ SAVE_VSX 23, 336, 9
+ SAVE_VSX 24, 352, 9
+ SAVE_VSX 25, 368, 9
+ SAVE_VSX 26, 384, 9
+ SAVE_VSX 27, 400, 9
+ SAVE_VSX 28, 416, 9
+ SAVE_VSX 29, 432, 9
+ SAVE_VSX 30, 448, 9
+ SAVE_VSX 31, 464, 9
+.endm # SAVE_REGS
+
+.macro RESTORE_REGS
+ addi 9, 1, 256
+ RESTORE_VRS 20, 0, 9
+ RESTORE_VRS 21, 16, 9
+ RESTORE_VRS 22, 32, 9
+ RESTORE_VRS 23, 48, 9
+ RESTORE_VRS 24, 64, 9
+ RESTORE_VRS 25, 80, 9
+ RESTORE_VRS 26, 96, 9
+ RESTORE_VRS 27, 112, 9
+ RESTORE_VRS 28, 128, 9
+ RESTORE_VRS 29, 144, 9
+ RESTORE_VRS 30, 160, 9
+ RESTORE_VRS 31, 176, 9
+
+ RESTORE_VSX 14, 192, 9
+ RESTORE_VSX 15, 208, 9
+ RESTORE_VSX 16, 224, 9
+ RESTORE_VSX 17, 240, 9
+ RESTORE_VSX 18, 256, 9
+ RESTORE_VSX 19, 272, 9
+ RESTORE_VSX 20, 288, 9
+ RESTORE_VSX 21, 304, 9
+ RESTORE_VSX 22, 320, 9
+ RESTORE_VSX 23, 336, 9
+ RESTORE_VSX 24, 352, 9
+ RESTORE_VSX 25, 368, 9
+ RESTORE_VSX 26, 384, 9
+ RESTORE_VSX 27, 400, 9
+ RESTORE_VSX 28, 416, 9
+ RESTORE_VSX 29, 432, 9
+ RESTORE_VSX 30, 448, 9
+ RESTORE_VSX 31, 464, 9
+
+ RESTORE_GPR 14, 112, 1
+ RESTORE_GPR 15, 120, 1
+ RESTORE_GPR 16, 128, 1
+ RESTORE_GPR 17, 136, 1
+ RESTORE_GPR 18, 144, 1
+ RESTORE_GPR 19, 152, 1
+ RESTORE_GPR 20, 160, 1
+ RESTORE_GPR 21, 168, 1
+ RESTORE_GPR 22, 176, 1
+ RESTORE_GPR 23, 184, 1
+ RESTORE_GPR 24, 192, 1
+ RESTORE_GPR 25, 200, 1
+ RESTORE_GPR 26, 208, 1
+ RESTORE_GPR 27, 216, 1
+ RESTORE_GPR 28, 224, 1
+ RESTORE_GPR 29, 232, 1
+ RESTORE_GPR 30, 240, 1
+ RESTORE_GPR 31, 248, 1
+
+ addi 1, 1, 752
+ ld 0, 16(1)
+ mtlr 0
+.endm # RESTORE_REGS
+
+.macro QT_loop_8x
+ # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 20, 20
+ vadduwm 0, 0, 4
+ vadduwm 1, 1, 5
+ vadduwm 2, 2, 6
+ vadduwm 3, 3, 7
+ vadduwm 16, 16, 20
+ vadduwm 17, 17, 21
+ vadduwm 18, 18, 22
+ vadduwm 19, 19, 23
+
+ vpermxor 12, 12, 0, 25
+ vpermxor 13, 13, 1, 25
+ vpermxor 14, 14, 2, 25
+ vpermxor 15, 15, 3, 25
+ vpermxor 28, 28, 16, 25
+ vpermxor 29, 29, 17, 25
+ vpermxor 30, 30, 18, 25
+ vpermxor 31, 31, 19, 25
+ xxlor 32+25, 0, 0
+ vadduwm 8, 8, 12
+ vadduwm 9, 9, 13
+ vadduwm 10, 10, 14
+ vadduwm 11, 11, 15
+ vadduwm 24, 24, 28
+ vadduwm 25, 25, 29
+ vadduwm 26, 26, 30
+ vadduwm 27, 27, 31
+ vxor 4, 4, 8
+ vxor 5, 5, 9
+ vxor 6, 6, 10
+ vxor 7, 7, 11
+ vxor 20, 20, 24
+ vxor 21, 21, 25
+ vxor 22, 22, 26
+ vxor 23, 23, 27
+
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 21, 21
+ vrlw 4, 4, 25 #
+ vrlw 5, 5, 25
+ vrlw 6, 6, 25
+ vrlw 7, 7, 25
+ vrlw 20, 20, 25 #
+ vrlw 21, 21, 25
+ vrlw 22, 22, 25
+ vrlw 23, 23, 25
+ xxlor 32+25, 0, 0
+ vadduwm 0, 0, 4
+ vadduwm 1, 1, 5
+ vadduwm 2, 2, 6
+ vadduwm 3, 3, 7
+ vadduwm 16, 16, 20
+ vadduwm 17, 17, 21
+ vadduwm 18, 18, 22
+ vadduwm 19, 19, 23
+
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 22, 22
+ vpermxor 12, 12, 0, 25
+ vpermxor 13, 13, 1, 25
+ vpermxor 14, 14, 2, 25
+ vpermxor 15, 15, 3, 25
+ vpermxor 28, 28, 16, 25
+ vpermxor 29, 29, 17, 25
+ vpermxor 30, 30, 18, 25
+ vpermxor 31, 31, 19, 25
+ xxlor 32+25, 0, 0
+ vadduwm 8, 8, 12
+ vadduwm 9, 9, 13
+ vadduwm 10, 10, 14
+ vadduwm 11, 11, 15
+ vadduwm 24, 24, 28
+ vadduwm 25, 25, 29
+ vadduwm 26, 26, 30
+ vadduwm 27, 27, 31
+ xxlor 0, 32+28, 32+28
+ xxlor 32+28, 23, 23
+ vxor 4, 4, 8
+ vxor 5, 5, 9
+ vxor 6, 6, 10
+ vxor 7, 7, 11
+ vxor 20, 20, 24
+ vxor 21, 21, 25
+ vxor 22, 22, 26
+ vxor 23, 23, 27
+ vrlw 4, 4, 28 #
+ vrlw 5, 5, 28
+ vrlw 6, 6, 28
+ vrlw 7, 7, 28
+ vrlw 20, 20, 28 #
+ vrlw 21, 21, 28
+ vrlw 22, 22, 28
+ vrlw 23, 23, 28
+ xxlor 32+28, 0, 0
+
+ # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 20, 20
+ vadduwm 0, 0, 5
+ vadduwm 1, 1, 6
+ vadduwm 2, 2, 7
+ vadduwm 3, 3, 4
+ vadduwm 16, 16, 21
+ vadduwm 17, 17, 22
+ vadduwm 18, 18, 23
+ vadduwm 19, 19, 20
+
+ vpermxor 15, 15, 0, 25
+ vpermxor 12, 12, 1, 25
+ vpermxor 13, 13, 2, 25
+ vpermxor 14, 14, 3, 25
+ vpermxor 31, 31, 16, 25
+ vpermxor 28, 28, 17, 25
+ vpermxor 29, 29, 18, 25
+ vpermxor 30, 30, 19, 25
+
+ xxlor 32+25, 0, 0
+ vadduwm 10, 10, 15
+ vadduwm 11, 11, 12
+ vadduwm 8, 8, 13
+ vadduwm 9, 9, 14
+ vadduwm 26, 26, 31
+ vadduwm 27, 27, 28
+ vadduwm 24, 24, 29
+ vadduwm 25, 25, 30
+ vxor 5, 5, 10
+ vxor 6, 6, 11
+ vxor 7, 7, 8
+ vxor 4, 4, 9
+ vxor 21, 21, 26
+ vxor 22, 22, 27
+ vxor 23, 23, 24
+ vxor 20, 20, 25
+
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 21, 21
+ vrlw 5, 5, 25
+ vrlw 6, 6, 25
+ vrlw 7, 7, 25
+ vrlw 4, 4, 25
+ vrlw 21, 21, 25
+ vrlw 22, 22, 25
+ vrlw 23, 23, 25
+ vrlw 20, 20, 25
+ xxlor 32+25, 0, 0
+
+ vadduwm 0, 0, 5
+ vadduwm 1, 1, 6
+ vadduwm 2, 2, 7
+ vadduwm 3, 3, 4
+ vadduwm 16, 16, 21
+ vadduwm 17, 17, 22
+ vadduwm 18, 18, 23
+ vadduwm 19, 19, 20
+
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 22, 22
+ vpermxor 15, 15, 0, 25
+ vpermxor 12, 12, 1, 25
+ vpermxor 13, 13, 2, 25
+ vpermxor 14, 14, 3, 25
+ vpermxor 31, 31, 16, 25
+ vpermxor 28, 28, 17, 25
+ vpermxor 29, 29, 18, 25
+ vpermxor 30, 30, 19, 25
+ xxlor 32+25, 0, 0
+
+ vadduwm 10, 10, 15
+ vadduwm 11, 11, 12
+ vadduwm 8, 8, 13
+ vadduwm 9, 9, 14
+ vadduwm 26, 26, 31
+ vadduwm 27, 27, 28
+ vadduwm 24, 24, 29
+ vadduwm 25, 25, 30
+
+ xxlor 0, 32+28, 32+28
+ xxlor 32+28, 23, 23
+ vxor 5, 5, 10
+ vxor 6, 6, 11
+ vxor 7, 7, 8
+ vxor 4, 4, 9
+ vxor 21, 21, 26
+ vxor 22, 22, 27
+ vxor 23, 23, 24
+ vxor 20, 20, 25
+ vrlw 5, 5, 28
+ vrlw 6, 6, 28
+ vrlw 7, 7, 28
+ vrlw 4, 4, 28
+ vrlw 21, 21, 28
+ vrlw 22, 22, 28
+ vrlw 23, 23, 28
+ vrlw 20, 20, 28
+ xxlor 32+28, 0, 0
+.endm
+
+.macro QT_loop_4x
+ # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+ vadduwm 0, 0, 4
+ vadduwm 1, 1, 5
+ vadduwm 2, 2, 6
+ vadduwm 3, 3, 7
+ vpermxor 12, 12, 0, 20
+ vpermxor 13, 13, 1, 20
+ vpermxor 14, 14, 2, 20
+ vpermxor 15, 15, 3, 20
+ vadduwm 8, 8, 12
+ vadduwm 9, 9, 13
+ vadduwm 10, 10, 14
+ vadduwm 11, 11, 15
+ vxor 4, 4, 8
+ vxor 5, 5, 9
+ vxor 6, 6, 10
+ vxor 7, 7, 11
+ vrlw 4, 4, 21
+ vrlw 5, 5, 21
+ vrlw 6, 6, 21
+ vrlw 7, 7, 21
+ vadduwm 0, 0, 4
+ vadduwm 1, 1, 5
+ vadduwm 2, 2, 6
+ vadduwm 3, 3, 7
+ vpermxor 12, 12, 0, 22
+ vpermxor 13, 13, 1, 22
+ vpermxor 14, 14, 2, 22
+ vpermxor 15, 15, 3, 22
+ vadduwm 8, 8, 12
+ vadduwm 9, 9, 13
+ vadduwm 10, 10, 14
+ vadduwm 11, 11, 15
+ vxor 4, 4, 8
+ vxor 5, 5, 9
+ vxor 6, 6, 10
+ vxor 7, 7, 11
+ vrlw 4, 4, 23
+ vrlw 5, 5, 23
+ vrlw 6, 6, 23
+ vrlw 7, 7, 23
+
+ # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
+ vadduwm 0, 0, 5
+ vadduwm 1, 1, 6
+ vadduwm 2, 2, 7
+ vadduwm 3, 3, 4
+ vpermxor 15, 15, 0, 20
+ vpermxor 12, 12, 1, 20
+ vpermxor 13, 13, 2, 20
+ vpermxor 14, 14, 3, 20
+ vadduwm 10, 10, 15
+ vadduwm 11, 11, 12
+ vadduwm 8, 8, 13
+ vadduwm 9, 9, 14
+ vxor 5, 5, 10
+ vxor 6, 6, 11
+ vxor 7, 7, 8
+ vxor 4, 4, 9
+ vrlw 5, 5, 21
+ vrlw 6, 6, 21
+ vrlw 7, 7, 21
+ vrlw 4, 4, 21
+ vadduwm 0, 0, 5
+ vadduwm 1, 1, 6
+ vadduwm 2, 2, 7
+ vadduwm 3, 3, 4
+ vpermxor 15, 15, 0, 22
+ vpermxor 12, 12, 1, 22
+ vpermxor 13, 13, 2, 22
+ vpermxor 14, 14, 3, 22
+ vadduwm 10, 10, 15
+ vadduwm 11, 11, 12
+ vadduwm 8, 8, 13
+ vadduwm 9, 9, 14
+ vxor 5, 5, 10
+ vxor 6, 6, 11
+ vxor 7, 7, 8
+ vxor 4, 4, 9
+ vrlw 5, 5, 23
+ vrlw 6, 6, 23
+ vrlw 7, 7, 23
+ vrlw 4, 4, 23
+.endm
+
+# Transpose
+.macro TP_4x a0 a1 a2 a3
+ xxmrghw 10, 32+\a0, 32+\a1 # a0, a1, b0, b1
+ xxmrghw 11, 32+\a2, 32+\a3 # a2, a3, b2, b3
+ xxmrglw 12, 32+\a0, 32+\a1 # c0, c1, d0, d1
+ xxmrglw 13, 32+\a2, 32+\a3 # c2, c3, d2, d3
+ xxpermdi 32+\a0, 10, 11, 0 # a0, a1, a2, a3
+ xxpermdi 32+\a1, 10, 11, 3 # b0, b1, b2, b3
+ xxpermdi 32+\a2, 12, 13, 0 # c0, c1, c2, c3
+ xxpermdi 32+\a3, 12, 13, 3 # d0, d1, d2, d3
+.endm
+
+# key stream = working state + state
+.macro Add_state S
+ vadduwm \S+0, \S+0, 16-\S
+ vadduwm \S+4, \S+4, 17-\S
+ vadduwm \S+8, \S+8, 18-\S
+ vadduwm \S+12, \S+12, 19-\S
+
+ vadduwm \S+1, \S+1, 16-\S
+ vadduwm \S+5, \S+5, 17-\S
+ vadduwm \S+9, \S+9, 18-\S
+ vadduwm \S+13, \S+13, 19-\S
+
+ vadduwm \S+2, \S+2, 16-\S
+ vadduwm \S+6, \S+6, 17-\S
+ vadduwm \S+10, \S+10, 18-\S
+ vadduwm \S+14, \S+14, 19-\S
+
+ vadduwm \S+3, \S+3, 16-\S
+ vadduwm \S+7, \S+7, 17-\S
+ vadduwm \S+11, \S+11, 18-\S
+ vadduwm \S+15, \S+15, 19-\S
+.endm
+
+#
+# write 256 bytes
+#
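+# loads 256 bytes of input from src+offset (r9), XORs it with the generated
+# keystream, and stores the result to dst+offset (r16); r17-r31 hold the
+# 16..240 byte offsets set up in the prologue
+#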
+.macro Write_256 S
+ add 9, 14, 5
+ add 16, 14, 4
+ lxvw4x 0, 0, 9
+ lxvw4x 1, 17, 9
+ lxvw4x 2, 18, 9
+ lxvw4x 3, 19, 9
+ lxvw4x 4, 20, 9
+ lxvw4x 5, 21, 9
+ lxvw4x 6, 22, 9
+ lxvw4x 7, 23, 9
+ lxvw4x 8, 24, 9
+ lxvw4x 9, 25, 9
+ lxvw4x 10, 26, 9
+ lxvw4x 11, 27, 9
+ lxvw4x 12, 28, 9
+ lxvw4x 13, 29, 9
+ lxvw4x 14, 30, 9
+ lxvw4x 15, 31, 9
+
+ xxlxor \S+32, \S+32, 0
+ xxlxor \S+36, \S+36, 1
+ xxlxor \S+40, \S+40, 2
+ xxlxor \S+44, \S+44, 3
+ xxlxor \S+33, \S+33, 4
+ xxlxor \S+37, \S+37, 5
+ xxlxor \S+41, \S+41, 6
+ xxlxor \S+45, \S+45, 7
+ xxlxor \S+34, \S+34, 8
+ xxlxor \S+38, \S+38, 9
+ xxlxor \S+42, \S+42, 10
+ xxlxor \S+46, \S+46, 11
+ xxlxor \S+35, \S+35, 12
+ xxlxor \S+39, \S+39, 13
+ xxlxor \S+43, \S+43, 14
+ xxlxor \S+47, \S+47, 15
+
+ stxvw4x \S+32, 0, 16
+ stxvw4x \S+36, 17, 16
+ stxvw4x \S+40, 18, 16
+ stxvw4x \S+44, 19, 16
+
+ stxvw4x \S+33, 20, 16
+ stxvw4x \S+37, 21, 16
+ stxvw4x \S+41, 22, 16
+ stxvw4x \S+45, 23, 16
+
+ stxvw4x \S+34, 24, 16
+ stxvw4x \S+38, 25, 16
+ stxvw4x \S+42, 26, 16
+ stxvw4x \S+46, 27, 16
+
+ stxvw4x \S+35, 28, 16
+ stxvw4x \S+39, 29, 16
+ stxvw4x \S+43, 30, 16
+ stxvw4x \S+47, 31, 16
+
+.endm
+
+#
+# void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, const u8 *src,
+# unsigned int len, int nrounds);
+#
+SYM_FUNC_START(chacha_p10le_8x)
+.align 5
+ cmpdi 6, 0
+ ble Out_no_chacha
+
+ SAVE_REGS
+
+ # r17 - r31 mainly for Write_256 macro.
+ li 17, 16
+ li 18, 32
+ li 19, 48
+ li 20, 64
+ li 21, 80
+ li 22, 96
+ li 23, 112
+ li 24, 128
+ li 25, 144
+ li 26, 160
+ li 27, 176
+ li 28, 192
+ li 29, 208
+ li 30, 224
+ li 31, 240
+
+ mr 15, 6 # len
+ li 14, 0 # offset to inp and outp
+
+ lxvw4x 48, 0, 3 # vr16, constants
+ lxvw4x 49, 17, 3 # vr17, key 1
+ lxvw4x 50, 18, 3 # vr18, key 2
+ lxvw4x 51, 19, 3 # vr19, counter, nonce
+
+ # create (0, 1, 2, 3) counters
+ vspltisw 0, 0
+ vspltisw 1, 1
+ vspltisw 2, 2
+ vspltisw 3, 3
+ vmrghw 4, 0, 1
+ vmrglw 5, 2, 3
+ vsldoi 30, 4, 5, 8 # vr30 counter, 4 (0, 1, 2, 3)
+
+ vspltisw 21, 12
+ vspltisw 23, 7
+
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxvw4x 32+20, 0, 11
+ lxvw4x 32+22, 17, 11
+
+ sradi 8, 7, 1
+
+ mtctr 8
+
+ # save constants to vsx
+ xxlor 16, 48, 48
+ xxlor 17, 49, 49
+ xxlor 18, 50, 50
+ xxlor 19, 51, 51
+
+ vspltisw 25, 4
+ vspltisw 26, 8
+
+ xxlor 25, 32+26, 32+26
+ xxlor 24, 32+25, 32+25
+
+ vadduwm 31, 30, 25 # counter = (0, 1, 2, 3) + (4, 4, 4, 4)
+ xxlor 30, 32+30, 32+30
+ xxlor 31, 32+31, 32+31
+
+ xxlor 20, 32+20, 32+20
+ xxlor 21, 32+21, 32+21
+ xxlor 22, 32+22, 32+22
+ xxlor 23, 32+23, 32+23
+
+ cmpdi 6, 512
+ blt Loop_last
+
+Loop_8x:
+ xxspltw 32+0, 16, 0
+ xxspltw 32+1, 16, 1
+ xxspltw 32+2, 16, 2
+ xxspltw 32+3, 16, 3
+
+ xxspltw 32+4, 17, 0
+ xxspltw 32+5, 17, 1
+ xxspltw 32+6, 17, 2
+ xxspltw 32+7, 17, 3
+ xxspltw 32+8, 18, 0
+ xxspltw 32+9, 18, 1
+ xxspltw 32+10, 18, 2
+ xxspltw 32+11, 18, 3
+ xxspltw 32+12, 19, 0
+ xxspltw 32+13, 19, 1
+ xxspltw 32+14, 19, 2
+ xxspltw 32+15, 19, 3
+ vadduwm 12, 12, 30 # increase counter
+
+ xxspltw 32+16, 16, 0
+ xxspltw 32+17, 16, 1
+ xxspltw 32+18, 16, 2
+ xxspltw 32+19, 16, 3
+
+ xxspltw 32+20, 17, 0
+ xxspltw 32+21, 17, 1
+ xxspltw 32+22, 17, 2
+ xxspltw 32+23, 17, 3
+ xxspltw 32+24, 18, 0
+ xxspltw 32+25, 18, 1
+ xxspltw 32+26, 18, 2
+ xxspltw 32+27, 18, 3
+ xxspltw 32+28, 19, 0
+ xxspltw 32+29, 19, 1
+ vadduwm 28, 28, 31 # increase counter
+ xxspltw 32+30, 19, 2
+ xxspltw 32+31, 19, 3
+
+.align 5
+quarter_loop_8x:
+ QT_loop_8x
+
+ bdnz quarter_loop_8x
+
+ xxlor 0, 32+30, 32+30
+ xxlor 32+30, 30, 30
+ vadduwm 12, 12, 30
+ xxlor 32+30, 0, 0
+ TP_4x 0, 1, 2, 3
+ TP_4x 4, 5, 6, 7
+ TP_4x 8, 9, 10, 11
+ TP_4x 12, 13, 14, 15
+
+ xxlor 0, 48, 48
+ xxlor 1, 49, 49
+ xxlor 2, 50, 50
+ xxlor 3, 51, 51
+ xxlor 48, 16, 16
+ xxlor 49, 17, 17
+ xxlor 50, 18, 18
+ xxlor 51, 19, 19
+ Add_state 0
+ xxlor 48, 0, 0
+ xxlor 49, 1, 1
+ xxlor 50, 2, 2
+ xxlor 51, 3, 3
+ Write_256 0
+ addi 14, 14, 256 # offset +=256
+ addi 15, 15, -256 # len -=256
+
+ xxlor 5, 32+31, 32+31
+ xxlor 32+31, 31, 31
+ vadduwm 28, 28, 31
+ xxlor 32+31, 5, 5
+ TP_4x 16+0, 16+1, 16+2, 16+3
+ TP_4x 16+4, 16+5, 16+6, 16+7
+ TP_4x 16+8, 16+9, 16+10, 16+11
+ TP_4x 16+12, 16+13, 16+14, 16+15
+
+ xxlor 32, 16, 16
+ xxlor 33, 17, 17
+ xxlor 34, 18, 18
+ xxlor 35, 19, 19
+ Add_state 16
+ Write_256 16
+ addi 14, 14, 256 # offset +=256
+ addi 15, 15, -256 # len -=256
+
+ xxlor 32+24, 24, 24
+ xxlor 32+25, 25, 25
+ xxlor 32+30, 30, 30
+ vadduwm 30, 30, 25
+ vadduwm 31, 30, 24
+ xxlor 30, 32+30, 32+30
+ xxlor 31, 32+31, 32+31
+
+ cmpdi 15, 0
+ beq Out_loop
+
+ cmpdi 15, 512
+ blt Loop_last
+
+ mtctr 8
+ b Loop_8x
+
+Loop_last:
+ lxvw4x 48, 0, 3 # vr16, constants
+ lxvw4x 49, 17, 3 # vr17, key 1
+ lxvw4x 50, 18, 3 # vr18, key 2
+ lxvw4x 51, 19, 3 # vr19, counter, nonce
+
+ vspltisw 21, 12
+ vspltisw 23, 7
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxvw4x 32+20, 0, 11
+ lxvw4x 32+22, 17, 11
+
+ sradi 8, 7, 1
+ mtctr 8
+
+Loop_4x:
+ vspltw 0, 16, 0
+ vspltw 1, 16, 1
+ vspltw 2, 16, 2
+ vspltw 3, 16, 3
+
+ vspltw 4, 17, 0
+ vspltw 5, 17, 1
+ vspltw 6, 17, 2
+ vspltw 7, 17, 3
+ vspltw 8, 18, 0
+ vspltw 9, 18, 1
+ vspltw 10, 18, 2
+ vspltw 11, 18, 3
+ vspltw 12, 19, 0
+ vadduwm 12, 12, 30 # increase counter
+ vspltw 13, 19, 1
+ vspltw 14, 19, 2
+ vspltw 15, 19, 3
+
+.align 5
+quarter_loop:
+ QT_loop_4x
+
+ bdnz quarter_loop
+
+ vadduwm 12, 12, 30
+ TP_4x 0, 1, 2, 3
+ TP_4x 4, 5, 6, 7
+ TP_4x 8, 9, 10, 11
+ TP_4x 12, 13, 14, 15
+
+ Add_state 0
+ Write_256 0
+ addi 14, 14, 256 # offset += 256
+ addi 15, 15, -256 # len -= 256
+
+ # Update state counter
+ vspltisw 25, 4
+ vadduwm 30, 30, 25
+
+ cmpdi 15, 0
+ beq Out_loop
+ cmpdi 15, 256
+ blt Out_loop
+
+ mtctr 8
+ b Loop_4x
+
+Out_loop:
+ RESTORE_REGS
+ blr
+
+Out_no_chacha:
+ li 3, 0
+ blr
+SYM_FUNC_END(chacha_p10le_8x)
+
+SYM_DATA_START_LOCAL(PERMX)
+.align 5
+permx:
+.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
+.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
+SYM_DATA_END(PERMX)
diff --git a/lib/crypto/powerpc/chacha.h b/lib/crypto/powerpc/chacha.h
new file mode 100644
index 000000000000..1df6e1ce31c4
--- /dev/null
+++ b/lib/crypto/powerpc/chacha.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ChaCha stream cipher (P10 accelerated)
+ *
+ * Copyright 2023- IBM Corp. All rights reserved.
+ */
+
+#include <crypto/internal/simd.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
+#include <linux/sizes.h>
+#include <asm/simd.h>
+#include <asm/switch_to.h>
+
+asmlinkage void chacha_p10le_8x(const struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int len, int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10);
+
+static void vsx_begin(void)
+{
+ preempt_disable();
+ enable_kernel_vsx();
+}
+
+static void vsx_end(void)
+{
+ disable_kernel_vsx();
+ preempt_enable();
+}
+
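+/*
+ * Encrypt/decrypt with the P10 asm kernel in whole 256-byte (8-block)
+ * chunks; any remaining tail falls back to the generic implementation.
+ */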
+static void chacha_p10_do_8x(struct chacha_state *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ unsigned int l = bytes & ~0x0FF;
+
+ if (l > 0) {
+ chacha_p10le_8x(state, dst, src, l, nrounds);
+ bytes -= l;
+ src += l;
+ dst += l;
+ state->x[12] += l / CHACHA_BLOCK_SIZE;
+ }
+
+ if (bytes > 0)
+ chacha_crypt_generic(state, dst, src, bytes, nrounds);
+}
+
+#define hchacha_block_arch hchacha_block_generic /* not implemented yet */
+
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int bytes, int nrounds)
+{
+ if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE ||
+ !crypto_simd_usable())
+ return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
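+ /*
+ * Process at most SZ_4K per VSX section so that preemption is not
+ * disabled for overly long stretches.
+ */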
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ vsx_begin();
+ chacha_p10_do_8x(state, dst, src, todo, nrounds);
+ vsx_end();
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
+}
+
+#define chacha_mod_init_arch chacha_mod_init_arch
+static void chacha_mod_init_arch(void)
+{
+ if (cpu_has_feature(CPU_FTR_ARCH_31))
+ static_branch_enable(&have_p10);
+}
diff --git a/lib/crypto/powerpc/curve25519-ppc64le_asm.S b/lib/crypto/powerpc/curve25519-ppc64le_asm.S
new file mode 100644
index 000000000000..06c1febe24b9
--- /dev/null
+++ b/lib/crypto/powerpc/curve25519-ppc64le_asm.S
@@ -0,0 +1,671 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# This code is taken from CRYPTOGAMs[1] and is included here using the option
+# in the license to distribute the code under the GPL. Therefore this program
+# is free software; you can redistribute it and/or modify it under the terms of
+# the GNU General Public License version 2 as published by the Free Software
+# Foundation.
+#
+# [1] https://github.com/dot-asm/cryptogams/
+
+# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain copyright notices,
+# this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# * Neither the name of the CRYPTOGAMS nor the names of its
+# copyright holder and contributors may be used to endorse or
+# promote products derived from this software without specific
+# prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+#
+# ====================================================================
+# Written and Modified by Danny Tsen <dtsen@us.ibm.com>
+# - Added x25519_fe51_sqr_times, x25519_fe51_frombytes, x25519_fe51_tobytes
+# and x25519_cswap
+#
+# Copyright 2024- IBM Corp.
+#
+# X25519 lower-level primitives for PPC64.
+#
+
+#include <linux/linkage.h>
+
+.text
+
+.align 5
+SYM_FUNC_START(x25519_fe51_mul)
+
+ stdu 1,-144(1)
+ std 21,56(1)
+ std 22,64(1)
+ std 23,72(1)
+ std 24,80(1)
+ std 25,88(1)
+ std 26,96(1)
+ std 27,104(1)
+ std 28,112(1)
+ std 29,120(1)
+ std 30,128(1)
+ std 31,136(1)
+
+ ld 6,0(5)
+ ld 7,0(4)
+ ld 8,8(4)
+ ld 9,16(4)
+ ld 10,24(4)
+ ld 11,32(4)
+
+ mulld 22,7,6
+ mulhdu 23,7,6
+
+ mulld 24,8,6
+ mulhdu 25,8,6
+
+ mulld 30,11,6
+ mulhdu 31,11,6
+ ld 4,8(5)
+ mulli 11,11,19
+
+ mulld 26,9,6
+ mulhdu 27,9,6
+
+ mulld 28,10,6
+ mulhdu 29,10,6
+ mulld 12,11,4
+ mulhdu 21,11,4
+ addc 22,22,12
+ adde 23,23,21
+
+ mulld 12,7,4
+ mulhdu 21,7,4
+ addc 24,24,12
+ adde 25,25,21
+
+ mulld 12,10,4
+ mulhdu 21,10,4
+ ld 6,16(5)
+ mulli 10,10,19
+ addc 30,30,12
+ adde 31,31,21
+
+ mulld 12,8,4
+ mulhdu 21,8,4
+ addc 26,26,12
+ adde 27,27,21
+
+ mulld 12,9,4
+ mulhdu 21,9,4
+ addc 28,28,12
+ adde 29,29,21
+ mulld 12,10,6
+ mulhdu 21,10,6
+ addc 22,22,12
+ adde 23,23,21
+
+ mulld 12,11,6
+ mulhdu 21,11,6
+ addc 24,24,12
+ adde 25,25,21
+
+ mulld 12,9,6
+ mulhdu 21,9,6
+ ld 4,24(5)
+ mulli 9,9,19
+ addc 30,30,12
+ adde 31,31,21
+
+ mulld 12,7,6
+ mulhdu 21,7,6
+ addc 26,26,12
+ adde 27,27,21
+
+ mulld 12,8,6
+ mulhdu 21,8,6
+ addc 28,28,12
+ adde 29,29,21
+ mulld 12,9,4
+ mulhdu 21,9,4
+ addc 22,22,12
+ adde 23,23,21
+
+ mulld 12,10,4
+ mulhdu 21,10,4
+ addc 24,24,12
+ adde 25,25,21
+
+ mulld 12,8,4
+ mulhdu 21,8,4
+ ld 6,32(5)
+ mulli 8,8,19
+ addc 30,30,12
+ adde 31,31,21
+
+ mulld 12,11,4
+ mulhdu 21,11,4
+ addc 26,26,12
+ adde 27,27,21
+
+ mulld 12,7,4
+ mulhdu 21,7,4
+ addc 28,28,12
+ adde 29,29,21
+ mulld 12,8,6
+ mulhdu 21,8,6
+ addc 22,22,12
+ adde 23,23,21
+
+ mulld 12,9,6
+ mulhdu 21,9,6
+ addc 24,24,12
+ adde 25,25,21
+
+ mulld 12,10,6
+ mulhdu 21,10,6
+ addc 26,26,12
+ adde 27,27,21
+
+ mulld 12,11,6
+ mulhdu 21,11,6
+ addc 28,28,12
+ adde 29,29,21
+
+ mulld 12,7,6
+ mulhdu 21,7,6
+ addc 30,30,12
+ adde 31,31,21
+
+.Lfe51_reduce:
+ li 0,-1
+ srdi 0,0,13 # 0x7ffffffffffff (2^51 - 1)
+
+ srdi 12,26,51
+ and 9,26,0
+ insrdi 12,27,51,0
+ srdi 21,22,51
+ and 7,22,0
+ insrdi 21,23,51,0
+ addc 28,28,12
+ addze 29,29
+ addc 24,24,21
+ addze 25,25
+
+ srdi 12,28,51
+ and 10,28,0
+ insrdi 12,29,51,0
+ srdi 21,24,51
+ and 8,24,0
+ insrdi 21,25,51,0
+ addc 30,30,12
+ addze 31,31
+ add 9,9,21
+
+ srdi 12,30,51
+ and 11,30,0
+ insrdi 12,31,51,0
+ mulli 12,12,19
+
+ add 7,7,12
+
+ srdi 21,9,51
+ and 9,9,0
+ add 10,10,21
+
+ srdi 12,7,51
+ and 7,7,0
+ add 8,8,12
+
+ std 9,16(3)
+ std 10,24(3)
+ std 11,32(3)
+ std 7,0(3)
+ std 8,8(3)
+
+ ld 21,56(1)
+ ld 22,64(1)
+ ld 23,72(1)
+ ld 24,80(1)
+ ld 25,88(1)
+ ld 26,96(1)
+ ld 27,104(1)
+ ld 28,112(1)
+ ld 29,120(1)
+ ld 30,128(1)
+ ld 31,136(1)
+ addi 1,1,144
+ blr
+SYM_FUNC_END(x25519_fe51_mul)
+
+.align 5
+SYM_FUNC_START(x25519_fe51_sqr)
+
+ stdu 1,-144(1)
+ std 21,56(1)
+ std 22,64(1)
+ std 23,72(1)
+ std 24,80(1)
+ std 25,88(1)
+ std 26,96(1)
+ std 27,104(1)
+ std 28,112(1)
+ std 29,120(1)
+ std 30,128(1)
+ std 31,136(1)
+
+ ld 7,0(4)
+ ld 8,8(4)
+ ld 9,16(4)
+ ld 10,24(4)
+ ld 11,32(4)
+
+ add 6,7,7
+ mulli 21,11,19
+
+ mulld 22,7,7
+ mulhdu 23,7,7
+ mulld 24,8,6
+ mulhdu 25,8,6
+ mulld 26,9,6
+ mulhdu 27,9,6
+ mulld 28,10,6
+ mulhdu 29,10,6
+ mulld 30,11,6
+ mulhdu 31,11,6
+ add 6,8,8
+ mulld 12,11,21
+ mulhdu 11,11,21
+ addc 28,28,12
+ adde 29,29,11
+
+ mulli 5,10,19
+
+ mulld 12,8,8
+ mulhdu 11,8,8
+ addc 26,26,12
+ adde 27,27,11
+ mulld 12,9,6
+ mulhdu 11,9,6
+ addc 28,28,12
+ adde 29,29,11
+ mulld 12,10,6
+ mulhdu 11,10,6
+ addc 30,30,12
+ adde 31,31,11
+ mulld 12,21,6
+ mulhdu 11,21,6
+ add 6,10,10
+ addc 22,22,12
+ adde 23,23,11
+ mulld 12,10,5
+ mulhdu 10,10,5
+ addc 24,24,12
+ adde 25,25,10
+ mulld 12,6,21
+ mulhdu 10,6,21
+ add 6,9,9
+ addc 26,26,12
+ adde 27,27,10
+
+ mulld 12,9,9
+ mulhdu 10,9,9
+ addc 30,30,12
+ adde 31,31,10
+ mulld 12,5,6
+ mulhdu 10,5,6
+ addc 22,22,12
+ adde 23,23,10
+ mulld 12,21,6
+ mulhdu 10,21,6
+ addc 24,24,12
+ adde 25,25,10
+
+ b .Lfe51_reduce
+SYM_FUNC_END(x25519_fe51_sqr)
+
+.align 5
+SYM_FUNC_START(x25519_fe51_mul121666)
+
+ stdu 1,-144(1)
+ std 21,56(1)
+ std 22,64(1)
+ std 23,72(1)
+ std 24,80(1)
+ std 25,88(1)
+ std 26,96(1)
+ std 27,104(1)
+ std 28,112(1)
+ std 29,120(1)
+ std 30,128(1)
+ std 31,136(1)
+
+ lis 6,1
+ ori 6,6,56130 # r6 = 121666
+ ld 7,0(4)
+ ld 8,8(4)
+ ld 9,16(4)
+ ld 10,24(4)
+ ld 11,32(4)
+
+ mulld 22,7,6
+ mulhdu 23,7,6
+ mulld 24,8,6
+ mulhdu 25,8,6
+ mulld 26,9,6
+ mulhdu 27,9,6
+ mulld 28,10,6
+ mulhdu 29,10,6
+ mulld 30,11,6
+ mulhdu 31,11,6
+
+ b .Lfe51_reduce
+SYM_FUNC_END(x25519_fe51_mul121666)
+
+.align 5
+SYM_FUNC_START(x25519_fe51_sqr_times)
+
+ stdu 1,-144(1)
+ std 21,56(1)
+ std 22,64(1)
+ std 23,72(1)
+ std 24,80(1)
+ std 25,88(1)
+ std 26,96(1)
+ std 27,104(1)
+ std 28,112(1)
+ std 29,120(1)
+ std 30,128(1)
+ std 31,136(1)
+
+ ld 7,0(4)
+ ld 8,8(4)
+ ld 9,16(4)
+ ld 10,24(4)
+ ld 11,32(4)
+
+ mtctr 5
+
+.Lsqr_times_loop:
+ add 6,7,7
+ mulli 21,11,19
+
+ mulld 22,7,7
+ mulhdu 23,7,7
+ mulld 24,8,6
+ mulhdu 25,8,6
+ mulld 26,9,6
+ mulhdu 27,9,6
+ mulld 28,10,6
+ mulhdu 29,10,6
+ mulld 30,11,6
+ mulhdu 31,11,6
+ add 6,8,8
+ mulld 12,11,21
+ mulhdu 11,11,21
+ addc 28,28,12
+ adde 29,29,11
+
+ mulli 5,10,19
+
+ mulld 12,8,8
+ mulhdu 11,8,8
+ addc 26,26,12
+ adde 27,27,11
+ mulld 12,9,6
+ mulhdu 11,9,6
+ addc 28,28,12
+ adde 29,29,11
+ mulld 12,10,6
+ mulhdu 11,10,6
+ addc 30,30,12
+ adde 31,31,11
+ mulld 12,21,6
+ mulhdu 11,21,6
+ add 6,10,10
+ addc 22,22,12
+ adde 23,23,11
+ mulld 12,10,5
+ mulhdu 10,10,5
+ addc 24,24,12
+ adde 25,25,10
+ mulld 12,6,21
+ mulhdu 10,6,21
+ add 6,9,9
+ addc 26,26,12
+ adde 27,27,10
+
+ mulld 12,9,9
+ mulhdu 10,9,9
+ addc 30,30,12
+ adde 31,31,10
+ mulld 12,5,6
+ mulhdu 10,5,6
+ addc 22,22,12
+ adde 23,23,10
+ mulld 12,21,6
+ mulhdu 10,21,6
+ addc 24,24,12
+ adde 25,25,10
+
+ # fe51_reduce
+ li 0,-1
+ srdi 0,0,13 # 0x7ffffffffffff (2^51 - 1)
+
+ srdi 12,26,51
+ and 9,26,0
+ insrdi 12,27,51,0
+ srdi 21,22,51
+ and 7,22,0
+ insrdi 21,23,51,0
+ addc 28,28,12
+ addze 29,29
+ addc 24,24,21
+ addze 25,25
+
+ srdi 12,28,51
+ and 10,28,0
+ insrdi 12,29,51,0
+ srdi 21,24,51
+ and 8,24,0
+ insrdi 21,25,51,0
+ addc 30,30,12
+ addze 31,31
+ add 9,9,21
+
+ srdi 12,30,51
+ and 11,30,0
+ insrdi 12,31,51,0
+ mulli 12,12,19
+
+ add 7,7,12
+
+ srdi 21,9,51
+ and 9,9,0
+ add 10,10,21
+
+ srdi 12,7,51
+ and 7,7,0
+ add 8,8,12
+
+ bdnz .Lsqr_times_loop
+
+ std 9,16(3)
+ std 10,24(3)
+ std 11,32(3)
+ std 7,0(3)
+ std 8,8(3)
+
+ ld 21,56(1)
+ ld 22,64(1)
+ ld 23,72(1)
+ ld 24,80(1)
+ ld 25,88(1)
+ ld 26,96(1)
+ ld 27,104(1)
+ ld 28,112(1)
+ ld 29,120(1)
+ ld 30,128(1)
+ ld 31,136(1)
+ addi 1,1,144
+ blr
+SYM_FUNC_END(x25519_fe51_sqr_times)
+
+.align 5
+SYM_FUNC_START(x25519_fe51_frombytes)
+
+ li 12, -1
+ srdi 12, 12, 13 # 0x7ffffffffffff
+
+ ld 5, 0(4)
+ ld 6, 8(4)
+ ld 7, 16(4)
+ ld 8, 24(4)
+
+ srdi 10, 5, 51
+ and 5, 5, 12 # h0
+
+ sldi 11, 6, 13
+ or 11, 10, 11 # h1t
+ srdi 10, 6, 38
+ and 6, 11, 12 # h1
+
+ sldi 11, 7, 26
+ or 10, 10, 11 # h2t
+
+ srdi 11, 7, 25
+ and 7, 10, 12 # h2
+ sldi 10, 8, 39
+ or 11, 11, 10 # h3t
+
+ srdi 9, 8, 12
+ and 8, 11, 12 # h3
+ and 9, 9, 12 # h4
+
+ std 5, 0(3)
+ std 6, 8(3)
+ std 7, 16(3)
+ std 8, 24(3)
+ std 9, 32(3)
+
+ blr
+SYM_FUNC_END(x25519_fe51_frombytes)
+
+.align 5
+SYM_FUNC_START(x25519_fe51_tobytes)
+
+ ld 5, 0(4)
+ ld 6, 8(4)
+ ld 7, 16(4)
+ ld 8, 24(4)
+ ld 9, 32(4)
+
+ li 12, -1
+ srdi 12, 12, 13 # 0x7ffffffffffff
+
+ # Full reduction
+ addi 10, 5, 19
+ srdi 10, 10, 51
+ add 10, 10, 6
+ srdi 10, 10, 51
+ add 10, 10, 7
+ srdi 10, 10, 51
+ add 10, 10, 8
+ srdi 10, 10, 51
+ add 10, 10, 9
+ srdi 10, 10, 51
+
+ mulli 10, 10, 19
+ add 5, 5, 10
+ srdi 11, 5, 51
+ add 6, 6, 11
+ srdi 11, 6, 51
+ add 7, 7, 11
+ srdi 11, 7, 51
+ add 8, 8, 11
+ srdi 11, 8, 51
+ add 9, 9, 11
+
+ and 5, 5, 12
+ and 6, 6, 12
+ and 7, 7, 12
+ and 8, 8, 12
+ and 9, 9, 12
+
+ sldi 10, 6, 51
+ or 5, 5, 10 # s0
+
+ srdi 11, 6, 13
+ sldi 10, 7, 38
+ or 6, 11, 10 # s1
+
+ srdi 11, 7, 26
+ sldi 10, 8, 25
+ or 7, 11, 10 # s2
+
+ srdi 11, 8, 39
+ sldi 10, 9, 12
+ or 8, 11, 10 # s3
+
+ std 5, 0(3)
+ std 6, 8(3)
+ std 7, 16(3)
+ std 8, 24(3)
+
+ blr
+SYM_FUNC_END(x25519_fe51_tobytes)
+
+.align 5
+SYM_FUNC_START(x25519_cswap)
+
+ li 7, 5
+ neg 6, 5
+ mtctr 7
+
+.Lswap_loop:
+ ld 8, 0(3)
+ ld 9, 0(4)
+ xor 10, 8, 9
+ and 10, 10, 6
+ xor 11, 8, 10
+ xor 12, 9, 10
+ std 11, 0(3)
+ addi 3, 3, 8
+ std 12, 0(4)
+ addi 4, 4, 8
+ bdnz .Lswap_loop
+
+ blr
+SYM_FUNC_END(x25519_cswap)
diff --git a/lib/crypto/powerpc/curve25519.h b/lib/crypto/powerpc/curve25519.h
new file mode 100644
index 000000000000..dee6234c48e9
--- /dev/null
+++ b/lib/crypto/powerpc/curve25519.h
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2024- IBM Corp.
+ *
+ * X25519 scalar multiplication with 51-bit limbs for ppc64le.
+ * Based on RFC 7748 and the AArch64-optimized implementation of X25519
+ * - Algorithm 1: scalar multiplication of a variable point
+ */
+
+#include <linux/types.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+
+#include <linux/cpufeature.h>
+#include <linux/processor.h>
+
+typedef uint64_t fe51[5];
+
+asmlinkage void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
+asmlinkage void x25519_fe51_sqr(fe51 h, const fe51 f);
+asmlinkage void x25519_fe51_mul121666(fe51 h, fe51 f);
+asmlinkage void x25519_fe51_sqr_times(fe51 h, const fe51 f, int n);
+asmlinkage void x25519_fe51_frombytes(fe51 h, const uint8_t *s);
+asmlinkage void x25519_fe51_tobytes(uint8_t *s, const fe51 h);
+asmlinkage void x25519_cswap(fe51 p, fe51 q, unsigned int bit);
+
+#define fmul x25519_fe51_mul
+#define fsqr x25519_fe51_sqr
+#define fmul121666 x25519_fe51_mul121666
+#define fe51_tobytes x25519_fe51_tobytes
+
+static void fadd(fe51 h, const fe51 f, const fe51 g)
+{
+ h[0] = f[0] + g[0];
+ h[1] = f[1] + g[1];
+ h[2] = f[2] + g[2];
+ h[3] = f[3] + g[3];
+ h[4] = f[4] + g[4];
+}
+
+/*
+ * Prime = 2 ** 255 - 19, 255 bits
+ * (0x7fffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffed)
+ *
+ * Prime in 5 51-bit limbs
+ */
+static fe51 prime51 = { 0x7ffffffffffed, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff};
+
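+/*
+ * Subtract with a 2*p bias: adding 2*p changes nothing mod p but keeps
+ * every limb difference from underflowing.
+ */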
+static void fsub(fe51 h, const fe51 f, const fe51 g)
+{
+ h[0] = (f[0] + (prime51[0] * 2)) - g[0];
+ h[1] = (f[1] + (prime51[1] * 2)) - g[1];
+ h[2] = (f[2] + (prime51[2] * 2)) - g[2];
+ h[3] = (f[3] + (prime51[3] * 2)) - g[3];
+ h[4] = (f[4] + (prime51[4] * 2)) - g[4];
+}
+
+static void fe51_frombytes(fe51 h, const uint8_t *s)
+{
+ /*
+ * Copy to a 64-bit-aligned buffer, since the asm loads 64-bit words.
+ */
+ unsigned char sbuf[32+8];
+ unsigned char *sb = PTR_ALIGN((void *)sbuf, 8);
+
+ memcpy(sb, s, 32);
+ x25519_fe51_frombytes(h, sb);
+}
+
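+/*
+ * Field inversion via Fermat's little theorem, o = i^(p-2) with
+ * p = 2^255 - 19, using the standard curve25519 addition chain.
+ */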
+static void finv(fe51 o, const fe51 i)
+{
+ fe51 a0, b, c, t00;
+
+ fsqr(a0, i);
+ x25519_fe51_sqr_times(t00, a0, 2);
+
+ fmul(b, t00, i);
+ fmul(a0, b, a0);
+
+ fsqr(t00, a0);
+
+ fmul(b, t00, b);
+ x25519_fe51_sqr_times(t00, b, 5);
+
+ fmul(b, t00, b);
+ x25519_fe51_sqr_times(t00, b, 10);
+
+ fmul(c, t00, b);
+ x25519_fe51_sqr_times(t00, c, 20);
+
+ fmul(t00, t00, c);
+ x25519_fe51_sqr_times(t00, t00, 10);
+
+ fmul(b, t00, b);
+ x25519_fe51_sqr_times(t00, b, 50);
+
+ fmul(c, t00, b);
+ x25519_fe51_sqr_times(t00, c, 100);
+
+ fmul(t00, t00, c);
+ x25519_fe51_sqr_times(t00, t00, 50);
+
+ fmul(t00, t00, b);
+ x25519_fe51_sqr_times(t00, t00, 5);
+
+ fmul(o, t00, a0);
+}
+
+static void curve25519_fe51(uint8_t out[32], const uint8_t scalar[32],
+ const uint8_t point[32])
+{
+ fe51 x1, x2, z2, x3, z3;
+ uint8_t s[32];
+ unsigned int swap = 0;
+ int i;
+
+ memcpy(s, scalar, 32);
+ s[0] &= 0xf8;
+ s[31] &= 0x7f;
+ s[31] |= 0x40;
+ fe51_frombytes(x1, point);
+
+ z2[0] = z2[1] = z2[2] = z2[3] = z2[4] = 0;
+ x3[0] = x1[0];
+ x3[1] = x1[1];
+ x3[2] = x1[2];
+ x3[3] = x1[3];
+ x3[4] = x1[4];
+
+ x2[0] = z3[0] = 1;
+ x2[1] = z3[1] = 0;
+ x2[2] = z3[2] = 0;
+ x2[3] = z3[3] = 0;
+ x2[4] = z3[4] = 0;
+
+ for (i = 254; i >= 0; --i) {
+ unsigned int k_t = 1 & (s[i / 8] >> (i & 7));
+ fe51 a, b, c, d, e;
+ fe51 da, cb, aa, bb;
+ fe51 dacb_p, dacb_m;
+
+ swap ^= k_t;
+ x25519_cswap(x2, x3, swap);
+ x25519_cswap(z2, z3, swap);
+ swap = k_t;
+
+ fsub(b, x2, z2); // B = x_2 - z_2
+ fadd(a, x2, z2); // A = x_2 + z_2
+ fsub(d, x3, z3); // D = x_3 - z_3
+ fadd(c, x3, z3); // C = x_3 + z_3
+
+ fsqr(bb, b); // BB = B^2
+ fsqr(aa, a); // AA = A^2
+ fmul(da, d, a); // DA = D * A
+ fmul(cb, c, b); // CB = C * B
+
+ fsub(e, aa, bb); // E = AA - BB
+ fmul(x2, aa, bb); // x2 = AA * BB
+ fadd(dacb_p, da, cb); // DA + CB
+ fsub(dacb_m, da, cb); // DA - CB
+
+ fmul121666(z3, e); // 121666 * E
+ fsqr(z2, dacb_m); // (DA - CB)^2
+ fsqr(x3, dacb_p); // x3 = (DA + CB)^2
+ fadd(b, bb, z3); // BB + 121666 * E
+ fmul(z3, x1, z2); // z3 = x1 * (DA - CB)^2
+ fmul(z2, e, b); // z2 = e * (BB + (DA + CB)^2)
+ }
+
+ finv(z2, z2);
+ fmul(x2, x2, z2);
+ fe51_tobytes(out, x2);
+}
+
+static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE])
+{
+ curve25519_fe51(mypublic, secret, basepoint);
+}
+
+static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE])
+{
+ curve25519_fe51(pub, secret, curve25519_base_point);
+}
diff --git a/lib/crypto/powerpc/md5-asm.S b/lib/crypto/powerpc/md5-asm.S
new file mode 100644
index 000000000000..fa6bc440cf4a
--- /dev/null
+++ b/lib/crypto/powerpc/md5-asm.S
@@ -0,0 +1,235 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Fast MD5 implementation for PPC
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+
+#define rHP r3
+#define rWP r4
+
+#define rH0 r0
+#define rH1 r6
+#define rH2 r7
+#define rH3 r5
+
+#define rW00 r8
+#define rW01 r9
+#define rW02 r10
+#define rW03 r11
+#define rW04 r12
+#define rW05 r14
+#define rW06 r15
+#define rW07 r16
+#define rW08 r17
+#define rW09 r18
+#define rW10 r19
+#define rW11 r20
+#define rW12 r21
+#define rW13 r22
+#define rW14 r23
+#define rW15 r24
+
+#define rT0 r25
+#define rT1 r26
+
+#define INITIALIZE \
+ PPC_STLU r1,-INT_FRAME_SIZE(r1); \
+ SAVE_GPRS(14, 26, r1) /* push registers onto stack */
+
+#define FINALIZE \
+ REST_GPRS(14, 26, r1); /* pop registers from stack */ \
+ addi r1,r1,INT_FRAME_SIZE
+
+#ifdef __BIG_ENDIAN__
+#define LOAD_DATA(reg, off) \
+ lwbrx reg,0,rWP; /* load data */
+#define INC_PTR \
+ addi rWP,rWP,4; /* increment per word */
+#define NEXT_BLOCK /* nothing to do */
+#else
+#define LOAD_DATA(reg, off) \
+ lwz reg,off(rWP); /* load data */
+#define INC_PTR /* nothing to do */
+#define NEXT_BLOCK \
+ addi rWP,rWP,64; /* increment per block */
+#endif
+
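+/*
+ * Each R_xx_yy macro below interleaves two MD5 rounds; rT0/rT1 assemble the
+ * round's boolean function, e.g. rounds 0-15 use F(b,c,d) = (b & c) | (~b & d).
+ */
+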
+#define R_00_15(a, b, c, d, w0, w1, p, q, off, k0h, k0l, k1h, k1l) \
+ LOAD_DATA(w0, off) /* W */ \
+ and rT0,b,c; /* 1: f = b and c */ \
+ INC_PTR /* ptr++ */ \
+ andc rT1,d,b; /* 1: f' = ~b and d */ \
+ LOAD_DATA(w1, off+4) /* W */ \
+ or rT0,rT0,rT1; /* 1: f = f or f' */ \
+ addi w0,w0,k0l; /* 1: wk = w + k */ \
+ add a,a,rT0; /* 1: a = a + f */ \
+ addis w0,w0,k0h; /* 1: wk = w + k' */ \
+ addis w1,w1,k1h; /* 2: wk = w + k */ \
+ add a,a,w0; /* 1: a = a + wk */ \
+ addi w1,w1,k1l; /* 2: wk = w + k' */ \
+ rotrwi a,a,p; /* 1: a = a rotl x */ \
+ add d,d,w1; /* 2: a = a + wk */ \
+ add a,a,b; /* 1: a = a + b */ \
+ and rT0,a,b; /* 2: f = b and c */ \
+ andc rT1,c,a; /* 2: f' = ~b and d */ \
+ or rT0,rT0,rT1; /* 2: f = f or f' */ \
+ add d,d,rT0; /* 2: a = a + f */ \
+ INC_PTR /* ptr++ */ \
+ rotrwi d,d,q; /* 2: a = a rotl x */ \
+ add d,d,a; /* 2: a = a + b */
+
+#define R_16_31(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
+ andc rT0,c,d; /* 1: f = c and ~d */ \
+ and rT1,b,d; /* 1: f' = b and d */ \
+ addi w0,w0,k0l; /* 1: wk = w + k */ \
+ or rT0,rT0,rT1; /* 1: f = f or f' */ \
+ addis w0,w0,k0h; /* 1: wk = w + k' */ \
+ add a,a,rT0; /* 1: a = a + f */ \
+ addi w1,w1,k1l; /* 2: wk = w + k */ \
+ add a,a,w0; /* 1: a = a + wk */ \
+ addis w1,w1,k1h; /* 2: wk = w + k' */ \
+ andc rT0,b,c; /* 2: f = c and ~d */ \
+ rotrwi a,a,p; /* 1: a = a rotl x */ \
+ add a,a,b; /* 1: a = a + b */ \
+ add d,d,w1; /* 2: a = a + wk */ \
+ and rT1,a,c; /* 2: f' = b and d */ \
+ or rT0,rT0,rT1; /* 2: f = f or f' */ \
+ add d,d,rT0; /* 2: a = a + f */ \
+ rotrwi d,d,q; /* 2: a = a rotl x */ \
+ add d,d,a; /* 2: a = a + b */
+
+#define R_32_47(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
+ xor rT0,b,c; /* 1: f' = b xor c */ \
+ addi w0,w0,k0l; /* 1: wk = w + k */ \
+ xor rT1,rT0,d; /* 1: f = f xor f' */ \
+ addis w0,w0,k0h; /* 1: wk = w + k' */ \
+ add a,a,rT1; /* 1: a = a + f */ \
+ addi w1,w1,k1l; /* 2: wk = w + k */ \
+ add a,a,w0; /* 1: a = a + wk */ \
+ addis w1,w1,k1h; /* 2: wk = w + k' */ \
+ rotrwi a,a,p; /* 1: a = a rotl x */ \
+ add d,d,w1; /* 2: a = a + wk */ \
+ add a,a,b; /* 1: a = a + b */ \
+ xor rT1,rT0,a; /* 2: f = b xor f' */ \
+ add d,d,rT1; /* 2: a = a + f */ \
+ rotrwi d,d,q; /* 2: a = a rotl x */ \
+ add d,d,a; /* 2: a = a + b */
+
+#define R_48_63(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
+ addi w0,w0,k0l; /* 1: w = w + k */ \
+ orc rT0,b,d; /* 1: f = b or ~d */ \
+ addis w0,w0,k0h; /* 1: w = w + k' */ \
+ xor rT0,rT0,c; /* 1: f = f xor c */ \
+ add a,a,w0; /* 1: a = a + wk */ \
+ addi w1,w1,k1l; /* 2: w = w + k */ \
+ add a,a,rT0; /* 1: a = a + f */ \
+ addis w1,w1,k1h; /* 2: w = w + k' */ \
+ rotrwi a,a,p; /* 1: a = a rotl x */ \
+ add a,a,b; /* 1: a = a + b */ \
+ orc rT0,a,c; /* 2: f = b or ~d */ \
+ add d,d,w1; /* 2: a = a + wk */ \
+ xor rT0,rT0,b; /* 2: f = f xor c */ \
+ add d,d,rT0; /* 2: a = a + f */ \
+ rotrwi d,d,q; /* 2: a = a rotl x */ \
+ add d,d,a; /* 2: a = a + b */
+
+_GLOBAL(ppc_md5_transform)
+ INITIALIZE
+
+ mtctr r5
+ lwz rH0,0(rHP)
+ lwz rH1,4(rHP)
+ lwz rH2,8(rHP)
+ lwz rH3,12(rHP)
+
+ppc_md5_main:
+ R_00_15(rH0, rH1, rH2, rH3, rW00, rW01, 25, 20, 0,
+ 0xd76b, -23432, 0xe8c8, -18602)
+ R_00_15(rH2, rH3, rH0, rH1, rW02, rW03, 15, 10, 8,
+ 0x2420, 0x70db, 0xc1be, -12562)
+ R_00_15(rH0, rH1, rH2, rH3, rW04, rW05, 25, 20, 16,
+ 0xf57c, 0x0faf, 0x4788, -14806)
+ R_00_15(rH2, rH3, rH0, rH1, rW06, rW07, 15, 10, 24,
+ 0xa830, 0x4613, 0xfd47, -27391)
+ R_00_15(rH0, rH1, rH2, rH3, rW08, rW09, 25, 20, 32,
+ 0x6981, -26408, 0x8b45, -2129)
+ R_00_15(rH2, rH3, rH0, rH1, rW10, rW11, 15, 10, 40,
+ 0xffff, 0x5bb1, 0x895d, -10306)
+ R_00_15(rH0, rH1, rH2, rH3, rW12, rW13, 25, 20, 48,
+ 0x6b90, 0x1122, 0xfd98, 0x7193)
+ R_00_15(rH2, rH3, rH0, rH1, rW14, rW15, 15, 10, 56,
+ 0xa679, 0x438e, 0x49b4, 0x0821)
+
+ R_16_31(rH0, rH1, rH2, rH3, rW01, rW06, 27, 23,
+ 0x0d56, 0x6e0c, 0x1810, 0x6d2d)
+ R_16_31(rH2, rH3, rH0, rH1, rW11, rW00, 18, 12,
+ 0x9d02, -32109, 0x124c, 0x2332)
+ R_16_31(rH0, rH1, rH2, rH3, rW05, rW10, 27, 23,
+ 0x8ea7, 0x4a33, 0x0245, -18270)
+ R_16_31(rH2, rH3, rH0, rH1, rW15, rW04, 18, 12,
+ 0x8eee, -8608, 0xf258, -5095)
+ R_16_31(rH0, rH1, rH2, rH3, rW09, rW14, 27, 23,
+ 0x969d, -10697, 0x1cbe, -15288)
+ R_16_31(rH2, rH3, rH0, rH1, rW03, rW08, 18, 12,
+ 0x3317, 0x3e99, 0xdbd9, 0x7c15)
+ R_16_31(rH0, rH1, rH2, rH3, rW13, rW02, 27, 23,
+ 0xac4b, 0x7772, 0xd8cf, 0x331d)
+ R_16_31(rH2, rH3, rH0, rH1, rW07, rW12, 18, 12,
+ 0x6a28, 0x6dd8, 0x219a, 0x3b68)
+
+ R_32_47(rH0, rH1, rH2, rH3, rW05, rW08, 28, 21,
+ 0x29cb, 0x28e5, 0x4218, -7788)
+ R_32_47(rH2, rH3, rH0, rH1, rW11, rW14, 16, 9,
+ 0x473f, 0x06d1, 0x3aae, 0x3036)
+ R_32_47(rH0, rH1, rH2, rH3, rW01, rW04, 28, 21,
+ 0xaea1, -15134, 0x640b, -11295)
+ R_32_47(rH2, rH3, rH0, rH1, rW07, rW10, 16, 9,
+ 0x8f4c, 0x4887, 0xbc7c, -22499)
+ R_32_47(rH0, rH1, rH2, rH3, rW13, rW00, 28, 21,
+ 0x7eb8, -27199, 0x00ea, 0x6050)
+ R_32_47(rH2, rH3, rH0, rH1, rW03, rW06, 16, 9,
+ 0xe01a, 0x22fe, 0x4447, 0x69c5)
+ R_32_47(rH0, rH1, rH2, rH3, rW09, rW12, 28, 21,
+ 0xb7f3, 0x0253, 0x59b1, 0x4d5b)
+ R_32_47(rH2, rH3, rH0, rH1, rW15, rW02, 16, 9,
+ 0x4701, -27017, 0xc7bd, -19859)
+
+ R_48_63(rH0, rH1, rH2, rH3, rW00, rW07, 26, 22,
+ 0x0988, -1462, 0x4c70, -19401)
+ R_48_63(rH2, rH3, rH0, rH1, rW14, rW05, 17, 11,
+ 0xadaf, -5221, 0xfc99, 0x66f7)
+ R_48_63(rH0, rH1, rH2, rH3, rW12, rW03, 26, 22,
+ 0x7e80, -16418, 0xba1e, -25587)
+ R_48_63(rH2, rH3, rH0, rH1, rW10, rW01, 17, 11,
+ 0x4130, 0x380d, 0xe0c5, 0x738d)
+ lwz rW00,0(rHP)
+ R_48_63(rH0, rH1, rH2, rH3, rW08, rW15, 26, 22,
+ 0xe837, -30770, 0xde8a, 0x69e8)
+ lwz rW14,4(rHP)
+ R_48_63(rH2, rH3, rH0, rH1, rW06, rW13, 17, 11,
+ 0x9e79, 0x260f, 0x256d, -27941)
+ lwz rW12,8(rHP)
+ R_48_63(rH0, rH1, rH2, rH3, rW04, rW11, 26, 22,
+ 0xab75, -20775, 0x4f9e, -28397)
+ lwz rW10,12(rHP)
+ R_48_63(rH2, rH3, rH0, rH1, rW02, rW09, 17, 11,
+ 0x662b, 0x7c56, 0x11b2, 0x0358)
+
+ add rH0,rH0,rW00
+ stw rH0,0(rHP)
+ add rH1,rH1,rW14
+ stw rH1,4(rHP)
+ add rH2,rH2,rW12
+ stw rH2,8(rHP)
+ add rH3,rH3,rW10
+ stw rH3,12(rHP)
+ NEXT_BLOCK
+
+ bdnz ppc_md5_main
+
+ FINALIZE
+ blr
diff --git a/lib/crypto/powerpc/md5.h b/lib/crypto/powerpc/md5.h
new file mode 100644
index 000000000000..540b08e34d1d
--- /dev/null
+++ b/lib/crypto/powerpc/md5.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * MD5 optimized for PowerPC
+ */
+
+void ppc_md5_transform(u32 *state, const u8 *data, size_t nblocks);
+
+static void md5_blocks(struct md5_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ ppc_md5_transform(state->h, data, nblocks);
+}
diff --git a/lib/crypto/powerpc/poly1305-p10le_64.S b/lib/crypto/powerpc/poly1305-p10le_64.S
new file mode 100644
index 000000000000..a3c1987f1ecd
--- /dev/null
+++ b/lib/crypto/powerpc/poly1305-p10le_64.S
@@ -0,0 +1,1075 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# Accelerated poly1305 implementation for ppc64le.
+#
+# Copyright 2023- IBM Corp. All rights reserved
+#
+#===================================================================================
+# Written by Danny Tsen <dtsen@us.ibm.com>
+#
+# Poly1305 - this version mainly uses vector/VSX/scalar instructions
+# - 26-bit limbs
+# - handles multiple 64-byte blocks
+#
+# Block size 16 bytes
+# key = (r, s)
+# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF
+# p = 2^130 - 5
+# a += m
+# a = (r + a) % p
+# a += s
+#
+# Improve performance by breaking the polynomial down into a sum of products with
+# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
+#
+# 07/22/21 - this revision is based on the above sum of products.  Set up r^4, r^3, r^2, r and s3, s2, s1, s0
+# in 9 vectors for the multiplications.
+#
+# setup r^4, r^3, r^2, r vectors
+# vs [r^1, r^3, r^2, r^4]
+# vs0 = [r0,.....]
+# vs1 = [r1,.....]
+# vs2 = [r2,.....]
+# vs3 = [r3,.....]
+# vs4 = [r4,.....]
+# vs5 = [r1*5,...]
+# vs6 = [r2*5,...]
+# vs7 = [r3*5,...]
+# vs8 = [r4*5,...]
+#
+# Each word in a vector holds one "r/s" member of [a * r/s].
+#
+# r0, r4*5, r3*5, r2*5, r1*5;
+# r1, r0, r4*5, r3*5, r2*5;
+# r2, r1, r0, r4*5, r3*5;
+# r3, r2, r1, r0, r4*5;
+# r4, r3, r2, r1, r0 ;
+#
+#
+# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
+# k = 32 bytes key
+# r3 = k (r, s)
+# r4 = mlen
+# r5 = m
+#
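+# For intuition (illustrative only): over four blocks the Horner evaluation
+#   h = ((((h0 + m1)*r + m2)*r + m3)*r + m4)*r  mod p
+# expands into the sum of products
+#   h = (h0 + m1)*r^4 + m2*r^3 + m3*r^2 + m4*r  mod p,
+# which lets all four block multiplications proceed in parallel vector lanes.
+#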
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+#include <linux/linkage.h>
+
+.machine "any"
+
+.text
+
+.macro SAVE_GPR GPR OFFSET FRAME
+ std \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro SAVE_VRS VRS OFFSET FRAME
+ li 16, \OFFSET
+ stvx \VRS, 16, \FRAME
+.endm
+
+.macro SAVE_VSX VSX OFFSET FRAME
+ li 16, \OFFSET
+ stxvx \VSX, 16, \FRAME
+.endm
+
+.macro RESTORE_GPR GPR OFFSET FRAME
+ ld \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro RESTORE_VRS VRS OFFSET FRAME
+ li 16, \OFFSET
+ lvx \VRS, 16, \FRAME
+.endm
+
+.macro RESTORE_VSX VSX OFFSET FRAME
+ li 16, \OFFSET
+ lxvx \VSX, 16, \FRAME
+.endm
+
+.macro SAVE_REGS
+ mflr 0
+ std 0, 16(1)
+ stdu 1,-752(1)
+
+ SAVE_GPR 14, 112, 1
+ SAVE_GPR 15, 120, 1
+ SAVE_GPR 16, 128, 1
+ SAVE_GPR 17, 136, 1
+ SAVE_GPR 18, 144, 1
+ SAVE_GPR 19, 152, 1
+ SAVE_GPR 20, 160, 1
+ SAVE_GPR 21, 168, 1
+ SAVE_GPR 22, 176, 1
+ SAVE_GPR 23, 184, 1
+ SAVE_GPR 24, 192, 1
+ SAVE_GPR 25, 200, 1
+ SAVE_GPR 26, 208, 1
+ SAVE_GPR 27, 216, 1
+ SAVE_GPR 28, 224, 1
+ SAVE_GPR 29, 232, 1
+ SAVE_GPR 30, 240, 1
+ SAVE_GPR 31, 248, 1
+
+ addi 9, 1, 256
+ SAVE_VRS 20, 0, 9
+ SAVE_VRS 21, 16, 9
+ SAVE_VRS 22, 32, 9
+ SAVE_VRS 23, 48, 9
+ SAVE_VRS 24, 64, 9
+ SAVE_VRS 25, 80, 9
+ SAVE_VRS 26, 96, 9
+ SAVE_VRS 27, 112, 9
+ SAVE_VRS 28, 128, 9
+ SAVE_VRS 29, 144, 9
+ SAVE_VRS 30, 160, 9
+ SAVE_VRS 31, 176, 9
+
+ SAVE_VSX 14, 192, 9
+ SAVE_VSX 15, 208, 9
+ SAVE_VSX 16, 224, 9
+ SAVE_VSX 17, 240, 9
+ SAVE_VSX 18, 256, 9
+ SAVE_VSX 19, 272, 9
+ SAVE_VSX 20, 288, 9
+ SAVE_VSX 21, 304, 9
+ SAVE_VSX 22, 320, 9
+ SAVE_VSX 23, 336, 9
+ SAVE_VSX 24, 352, 9
+ SAVE_VSX 25, 368, 9
+ SAVE_VSX 26, 384, 9
+ SAVE_VSX 27, 400, 9
+ SAVE_VSX 28, 416, 9
+ SAVE_VSX 29, 432, 9
+ SAVE_VSX 30, 448, 9
+ SAVE_VSX 31, 464, 9
+.endm # SAVE_REGS
+
+.macro RESTORE_REGS
+ addi 9, 1, 256
+ RESTORE_VRS 20, 0, 9
+ RESTORE_VRS 21, 16, 9
+ RESTORE_VRS 22, 32, 9
+ RESTORE_VRS 23, 48, 9
+ RESTORE_VRS 24, 64, 9
+ RESTORE_VRS 25, 80, 9
+ RESTORE_VRS 26, 96, 9
+ RESTORE_VRS 27, 112, 9
+ RESTORE_VRS 28, 128, 9
+ RESTORE_VRS 29, 144, 9
+ RESTORE_VRS 30, 160, 9
+ RESTORE_VRS 31, 176, 9
+
+ RESTORE_VSX 14, 192, 9
+ RESTORE_VSX 15, 208, 9
+ RESTORE_VSX 16, 224, 9
+ RESTORE_VSX 17, 240, 9
+ RESTORE_VSX 18, 256, 9
+ RESTORE_VSX 19, 272, 9
+ RESTORE_VSX 20, 288, 9
+ RESTORE_VSX 21, 304, 9
+ RESTORE_VSX 22, 320, 9
+ RESTORE_VSX 23, 336, 9
+ RESTORE_VSX 24, 352, 9
+ RESTORE_VSX 25, 368, 9
+ RESTORE_VSX 26, 384, 9
+ RESTORE_VSX 27, 400, 9
+ RESTORE_VSX 28, 416, 9
+ RESTORE_VSX 29, 432, 9
+ RESTORE_VSX 30, 448, 9
+ RESTORE_VSX 31, 464, 9
+
+ RESTORE_GPR 14, 112, 1
+ RESTORE_GPR 15, 120, 1
+ RESTORE_GPR 16, 128, 1
+ RESTORE_GPR 17, 136, 1
+ RESTORE_GPR 18, 144, 1
+ RESTORE_GPR 19, 152, 1
+ RESTORE_GPR 20, 160, 1
+ RESTORE_GPR 21, 168, 1
+ RESTORE_GPR 22, 176, 1
+ RESTORE_GPR 23, 184, 1
+ RESTORE_GPR 24, 192, 1
+ RESTORE_GPR 25, 200, 1
+ RESTORE_GPR 26, 208, 1
+ RESTORE_GPR 27, 216, 1
+ RESTORE_GPR 28, 224, 1
+ RESTORE_GPR 29, 232, 1
+ RESTORE_GPR 30, 240, 1
+ RESTORE_GPR 31, 248, 1
+
+ addi 1, 1, 752
+ ld 0, 16(1)
+ mtlr 0
+.endm # RESTORE_REGS
+
+#
+# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
+# p[1] = a0*r1 + a1*r0 + a2*r4*5 + a3*r3*5 + a4*r2*5;
+# p[2] = a0*r2 + a1*r1 + a2*r0 + a3*r4*5 + a4*r3*5;
+# p[3] = a0*r3 + a1*r2 + a2*r1 + a3*r0 + a4*r4*5;
+# p[4] = a0*r4 + a1*r3 + a2*r2 + a3*r1 + a4*r0 ;
+#
+# [r^2, r^3, r^1, r^4]
+# [m3, m2, m4, m1]
+#
+# multiply odd and even words
+.macro mul_odd
+ vmulouw 14, 4, 26
+ vmulouw 10, 5, 3
+ vmulouw 11, 6, 2
+ vmulouw 12, 7, 1
+ vmulouw 13, 8, 0
+ vmulouw 15, 4, 27
+ vaddudm 14, 14, 10
+ vaddudm 14, 14, 11
+ vmulouw 10, 5, 26
+ vmulouw 11, 6, 3
+ vaddudm 14, 14, 12
+ vaddudm 14, 14, 13 # x0
+ vaddudm 15, 15, 10
+ vaddudm 15, 15, 11
+ vmulouw 12, 7, 2
+ vmulouw 13, 8, 1
+ vaddudm 15, 15, 12
+ vaddudm 15, 15, 13 # x1
+ vmulouw 16, 4, 28
+ vmulouw 10, 5, 27
+ vmulouw 11, 6, 26
+ vaddudm 16, 16, 10
+ vaddudm 16, 16, 11
+ vmulouw 12, 7, 3
+ vmulouw 13, 8, 2
+ vaddudm 16, 16, 12
+ vaddudm 16, 16, 13 # x2
+ vmulouw 17, 4, 29
+ vmulouw 10, 5, 28
+ vmulouw 11, 6, 27
+ vaddudm 17, 17, 10
+ vaddudm 17, 17, 11
+ vmulouw 12, 7, 26
+ vmulouw 13, 8, 3
+ vaddudm 17, 17, 12
+ vaddudm 17, 17, 13 # x3
+ vmulouw 18, 4, 30
+ vmulouw 10, 5, 29
+ vmulouw 11, 6, 28
+ vaddudm 18, 18, 10
+ vaddudm 18, 18, 11
+ vmulouw 12, 7, 27
+ vmulouw 13, 8, 26
+ vaddudm 18, 18, 12
+ vaddudm 18, 18, 13 # x4
+.endm
+
+.macro mul_even
+ vmuleuw 9, 4, 26
+ vmuleuw 10, 5, 3
+ vmuleuw 11, 6, 2
+ vmuleuw 12, 7, 1
+ vmuleuw 13, 8, 0
+ vaddudm 14, 14, 9
+ vaddudm 14, 14, 10
+ vaddudm 14, 14, 11
+ vaddudm 14, 14, 12
+ vaddudm 14, 14, 13 # x0
+
+ vmuleuw 9, 4, 27
+ vmuleuw 10, 5, 26
+ vmuleuw 11, 6, 3
+ vmuleuw 12, 7, 2
+ vmuleuw 13, 8, 1
+ vaddudm 15, 15, 9
+ vaddudm 15, 15, 10
+ vaddudm 15, 15, 11
+ vaddudm 15, 15, 12
+ vaddudm 15, 15, 13 # x1
+
+ vmuleuw 9, 4, 28
+ vmuleuw 10, 5, 27
+ vmuleuw 11, 6, 26
+ vmuleuw 12, 7, 3
+ vmuleuw 13, 8, 2
+ vaddudm 16, 16, 9
+ vaddudm 16, 16, 10
+ vaddudm 16, 16, 11
+ vaddudm 16, 16, 12
+ vaddudm 16, 16, 13 # x2
+
+ vmuleuw 9, 4, 29
+ vmuleuw 10, 5, 28
+ vmuleuw 11, 6, 27
+ vmuleuw 12, 7, 26
+ vmuleuw 13, 8, 3
+ vaddudm 17, 17, 9
+ vaddudm 17, 17, 10
+ vaddudm 17, 17, 11
+ vaddudm 17, 17, 12
+ vaddudm 17, 17, 13 # x3
+
+ vmuleuw 9, 4, 30
+ vmuleuw 10, 5, 29
+ vmuleuw 11, 6, 28
+ vmuleuw 12, 7, 27
+ vmuleuw 13, 8, 26
+ vaddudm 18, 18, 9
+ vaddudm 18, 18, 10
+ vaddudm 18, 18, 11
+ vaddudm 18, 18, 12
+ vaddudm 18, 18, 13 # x4
+.endm
+
+#
+# poly1305_setup_r
+#
+# setup r^4, r^3, r^2, r vectors
+# [r, r^3, r^2, r^4]
+# vs0 = [r0,...]
+# vs1 = [r1,...]
+# vs2 = [r2,...]
+# vs3 = [r3,...]
+# vs4 = [r4,...]
+# vs5 = [r4*5,...]
+# vs6 = [r3*5,...]
+# vs7 = [r2*5,...]
+# vs8 = [r1*5,...]
+#
+# r0, r4*5, r3*5, r2*5, r1*5;
+# r1, r0, r4*5, r3*5, r2*5;
+# r2, r1, r0, r4*5, r3*5;
+# r3, r2, r1, r0, r4*5;
+# r4, r3, r2, r1, r0 ;
+#
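+# As an illustrative C sketch (ad hoc names), the two do_mul calls in
+# this macro build the power table roughly as:
+#
+#   r2 = poly_mul(r, r);    /* r^2 */
+#   r3 = poly_mul(r2, r);   /* r^3 */
+#   r4 = poly_mul(r2, r2);  /* r^4 */
+#
+# after which the limbs are permuted into the lane order shown above.
+#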
+.macro poly1305_setup_r
+
+ # save r
+ xxlor 26, 58, 58
+ xxlor 27, 59, 59
+ xxlor 28, 60, 60
+ xxlor 29, 61, 61
+ xxlor 30, 62, 62
+
+ xxlxor 31, 31, 31
+
+# [r, r^3, r^2, r^4]
+ # compute r^2
+ vmr 4, 26
+ vmr 5, 27
+ vmr 6, 28
+ vmr 7, 29
+ vmr 8, 30
+ bl do_mul # r^2 r^1
+ xxpermdi 58, 58, 36, 0x3 # r0
+ xxpermdi 59, 59, 37, 0x3 # r1
+ xxpermdi 60, 60, 38, 0x3 # r2
+ xxpermdi 61, 61, 39, 0x3 # r3
+ xxpermdi 62, 62, 40, 0x3 # r4
+ xxpermdi 36, 36, 36, 0x3
+ xxpermdi 37, 37, 37, 0x3
+ xxpermdi 38, 38, 38, 0x3
+ xxpermdi 39, 39, 39, 0x3
+ xxpermdi 40, 40, 40, 0x3
+ vspltisb 13, 2
+ vsld 9, 27, 13
+ vsld 10, 28, 13
+ vsld 11, 29, 13
+ vsld 12, 30, 13
+ vaddudm 0, 9, 27
+ vaddudm 1, 10, 28
+ vaddudm 2, 11, 29
+ vaddudm 3, 12, 30
+
+ bl do_mul # r^4 r^3
+ vmrgow 26, 26, 4
+ vmrgow 27, 27, 5
+ vmrgow 28, 28, 6
+ vmrgow 29, 29, 7
+ vmrgow 30, 30, 8
+ vspltisb 13, 2
+ vsld 9, 27, 13
+ vsld 10, 28, 13
+ vsld 11, 29, 13
+ vsld 12, 30, 13
+ vaddudm 0, 9, 27
+ vaddudm 1, 10, 28
+ vaddudm 2, 11, 29
+ vaddudm 3, 12, 30
+
+ # r^2 r^4
+ xxlor 0, 58, 58
+ xxlor 1, 59, 59
+ xxlor 2, 60, 60
+ xxlor 3, 61, 61
+ xxlor 4, 62, 62
+ xxlor 5, 32, 32
+ xxlor 6, 33, 33
+ xxlor 7, 34, 34
+ xxlor 8, 35, 35
+
+ vspltw 9, 26, 3
+ vspltw 10, 26, 2
+ vmrgow 26, 10, 9
+ vspltw 9, 27, 3
+ vspltw 10, 27, 2
+ vmrgow 27, 10, 9
+ vspltw 9, 28, 3
+ vspltw 10, 28, 2
+ vmrgow 28, 10, 9
+ vspltw 9, 29, 3
+ vspltw 10, 29, 2
+ vmrgow 29, 10, 9
+ vspltw 9, 30, 3
+ vspltw 10, 30, 2
+ vmrgow 30, 10, 9
+
+ vsld 9, 27, 13
+ vsld 10, 28, 13
+ vsld 11, 29, 13
+ vsld 12, 30, 13
+ vaddudm 0, 9, 27
+ vaddudm 1, 10, 28
+ vaddudm 2, 11, 29
+ vaddudm 3, 12, 30
+.endm
+
+SYM_FUNC_START_LOCAL(do_mul)
+ mul_odd
+
+ # do reduction ( h %= p )
+ # carry reduction
+ vspltisb 9, 2
+ vsrd 10, 14, 31
+ vsrd 11, 17, 31
+ vand 7, 17, 25
+ vand 4, 14, 25
+ vaddudm 18, 18, 11
+ vsrd 12, 18, 31
+ vaddudm 15, 15, 10
+
+ vsrd 11, 15, 31
+ vand 8, 18, 25
+ vand 5, 15, 25
+ vaddudm 4, 4, 12
+ vsld 10, 12, 9
+ vaddudm 6, 16, 11
+
+ vsrd 13, 6, 31
+ vand 6, 6, 25
+ vaddudm 4, 4, 10
+ vsrd 10, 4, 31
+ vaddudm 7, 7, 13
+
+ vsrd 11, 7, 31
+ vand 7, 7, 25
+ vand 4, 4, 25
+ vaddudm 5, 5, 10
+ vaddudm 8, 8, 11
+ blr
+SYM_FUNC_END(do_mul)
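+
+# Illustrative C view of the carry chain in do_mul (ad hoc names):
+#
+#   c = x4 >> 26;  x4 &= 0x3ffffff;
+#   x0 += c * 5;   /* 2^130 == 5 (mod 2^130 - 5) */
+#   c = x0 >> 26;  x0 &= 0x3ffffff;  x1 += c;
+#   /* ...and likewise through the remaining limbs */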
+
+#
+# init key
+#
+.macro do_poly1305_init
+ addis 10, 2, rmask@toc@ha
+ addi 10, 10, rmask@toc@l
+
+ ld 11, 0(10)
+ ld 12, 8(10)
+
+ li 14, 16
+ li 15, 32
+ addis 10, 2, cnum@toc@ha
+ addi 10, 10, cnum@toc@l
+ lvx 25, 0, 10 # v25 = 0x3ffffff (26-bit limb mask)
+ lvx 31, 14, 10 # v31 = 0x1a (26, limb shift amount)
+ lvx 19, 15, 10 # v19 = 1 << 24
+ lxv 24, 48(10) # vs24
+ lxv 25, 64(10) # vs25
+
+ # initialize
+ # load key from r3 to vectors
+ ld 9, 24(3)
+ ld 10, 32(3)
+ and. 9, 9, 11
+ and. 10, 10, 12
+
+ # break 26 bits
+ extrdi 14, 9, 26, 38
+ extrdi 15, 9, 26, 12
+ extrdi 16, 9, 12, 0
+ mtvsrdd 58, 0, 14
+ insrdi 16, 10, 14, 38
+ mtvsrdd 59, 0, 15
+ extrdi 17, 10, 26, 24
+ mtvsrdd 60, 0, 16
+ extrdi 18, 10, 24, 0
+ mtvsrdd 61, 0, 17
+ mtvsrdd 62, 0, 18
+
+ # r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5
+ li 9, 5
+ mtvsrdd 36, 0, 9
+ vmulouw 0, 27, 4 # v0 = rr0
+ vmulouw 1, 28, 4 # v1 = rr1
+ vmulouw 2, 29, 4 # v2 = rr2
+ vmulouw 3, 30, 4 # v3 = rr3
+.endm
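+
+# The extrdi/insrdi sequence above is the usual radix-2^26 split; as an
+# illustrative C sketch (ad hoc names) for the two key doublewords lo, hi:
+#
+#   r0 = lo & 0x3ffffff;
+#   r1 = (lo >> 26) & 0x3ffffff;
+#   r2 = ((lo >> 52) | (hi << 12)) & 0x3ffffff;
+#   r3 = (hi >> 14) & 0x3ffffff;
+#   r4 = hi >> 40;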
+
+#
+# poly1305_p10le_4blocks(struct poly1305_block_state *state, const u8 *m, u32 mlen)
+#  state: accumulator h (bytes 0..16) and clamped key r (bytes 24..39)
+#  r3 = state
+#  r4 = m
+#  r5 = mlen (processed in 64-byte, i.e. 4-block, chunks)
+#
+SYM_FUNC_START(poly1305_p10le_4blocks)
+.align 5
+ cmpdi 5, 64
+ blt Out_no_poly1305
+
+ SAVE_REGS
+
+ do_poly1305_init
+
+ li 21, 0 # counter to message
+
+ poly1305_setup_r
+
+ # load previous H state from 0(r3) and
+ # break/convert it to 26-bit limbs
+ ld 9, 0(3)
+ ld 10, 8(3)
+ ld 19, 16(3)
+ sldi 19, 19, 24
+ mtvsrdd 41, 0, 19
+ extrdi 14, 9, 26, 38
+ extrdi 15, 9, 26, 12
+ extrdi 16, 9, 12, 0
+ mtvsrdd 36, 0, 14
+ insrdi 16, 10, 14, 38
+ mtvsrdd 37, 0, 15
+ extrdi 17, 10, 26, 24
+ mtvsrdd 38, 0, 16
+ extrdi 18, 10, 24, 0
+ mtvsrdd 39, 0, 17
+ mtvsrdd 40, 0, 18
+ vor 8, 8, 9
+
+ # input m1 m2
+ add 20, 4, 21
+ xxlor 49, 24, 24
+ xxlor 50, 25, 25
+ lxvw4x 43, 0, 20
+ addi 17, 20, 16
+ lxvw4x 44, 0, 17
+ vperm 14, 11, 12, 17
+ vperm 15, 11, 12, 18
+ vand 9, 14, 25 # a0
+ vsrd 10, 14, 31 # >> 26
+ vsrd 11, 10, 31 # 12 bits left
+ vand 10, 10, 25 # a1
+ vspltisb 13, 12
+ vand 16, 15, 25
+ vsld 12, 16, 13
+ vor 11, 11, 12
+ vand 11, 11, 25 # a2
+ vspltisb 13, 14
+ vsrd 12, 15, 13 # >> 14
+ vsrd 13, 12, 31 # >> 26, a4
+ vand 12, 12, 25 # a3
+
+ vaddudm 20, 4, 9
+ vaddudm 21, 5, 10
+ vaddudm 22, 6, 11
+ vaddudm 23, 7, 12
+ vaddudm 24, 8, 13
+
+ # m3 m4
+ addi 17, 17, 16
+ lxvw4x 43, 0, 17
+ addi 17, 17, 16
+ lxvw4x 44, 0, 17
+ vperm 14, 11, 12, 17
+ vperm 15, 11, 12, 18
+ vand 9, 14, 25 # a0
+ vsrd 10, 14, 31 # >> 26
+ vsrd 11, 10, 31 # 12 bits left
+ vand 10, 10, 25 # a1
+ vspltisb 13, 12
+ vand 16, 15, 25
+ vsld 12, 16, 13
+ vspltisb 13, 14
+ vor 11, 11, 12
+ vand 11, 11, 25 # a2
+ vsrd 12, 15, 13 # >> 14
+ vsrd 13, 12, 31 # >> 26, a4
+ vand 12, 12, 25 # a3
+
+ # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
+ vmrgow 4, 9, 20
+ vmrgow 5, 10, 21
+ vmrgow 6, 11, 22
+ vmrgow 7, 12, 23
+ vmrgow 8, 13, 24
+ vaddudm 8, 8, 19
+
+ addi 5, 5, -64 # len -= 64
+ addi 21, 21, 64 # offset += 64
+
+ li 9, 64
+ divdu 31, 5, 9
+
+ cmpdi 31, 0
+ ble Skip_block_loop
+
+ mtctr 31
+
+# h4 = m1 * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
+# Rewrite the polynomial sum of products as follows:
+# h1 = (h0 + m1) * r^2, h2 = (h0 + m2) * r^2
+# h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2 --> (h0 + m1) r^4 + m3 r^2, (h0 + m2) r^4 + m4 r^2
+# .... repeat
+# h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2 -->
+# h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r^1 --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
+#
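+# In illustrative C terms (ad hoc names), each pass advances interleaved
+# accumulators in lock step:
+#
+#   h_even = (h_even + m_even) * r2;
+#   h_odd  = (h_odd  + m_odd)  * r2;
+#
+# and the flush after the loop applies the mixed power table set up in
+# poly1305_setup_r before summing the lanes.
+#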
+loop_4blocks:
+
+ # Multiply odd words and even words
+ mul_odd
+ mul_even
+ # carry reduction
+ vspltisb 9, 2
+ vsrd 10, 14, 31
+ vsrd 11, 17, 31
+ vand 7, 17, 25
+ vand 4, 14, 25
+ vaddudm 18, 18, 11
+ vsrd 12, 18, 31
+ vaddudm 15, 15, 10
+
+ vsrd 11, 15, 31
+ vand 8, 18, 25
+ vand 5, 15, 25
+ vaddudm 4, 4, 12
+ vsld 10, 12, 9
+ vaddudm 6, 16, 11
+
+ vsrd 13, 6, 31
+ vand 6, 6, 25
+ vaddudm 4, 4, 10
+ vsrd 10, 4, 31
+ vaddudm 7, 7, 13
+
+ vsrd 11, 7, 31
+ vand 7, 7, 25
+ vand 4, 4, 25
+ vaddudm 5, 5, 10
+ vaddudm 8, 8, 11
+
+ # input m1 m2 m3 m4
+ add 20, 4, 21
+ xxlor 49, 24, 24
+ xxlor 50, 25, 25
+ lxvw4x 43, 0, 20
+ addi 17, 20, 16
+ lxvw4x 44, 0, 17
+ vperm 14, 11, 12, 17
+ vperm 15, 11, 12, 18
+ addi 17, 17, 16
+ lxvw4x 43, 0, 17
+ addi 17, 17, 16
+ lxvw4x 44, 0, 17
+ vperm 17, 11, 12, 17
+ vperm 18, 11, 12, 18
+
+ vand 20, 14, 25 # a0
+ vand 9, 17, 25 # a0
+ vsrd 21, 14, 31 # >> 26
+ vsrd 22, 21, 31 # 12 bits left
+ vsrd 10, 17, 31 # >> 26
+ vsrd 11, 10, 31 # 12 bits left
+
+ vand 21, 21, 25 # a1
+ vand 10, 10, 25 # a1
+
+ vspltisb 13, 12
+ vand 16, 15, 25
+ vsld 23, 16, 13
+ vor 22, 22, 23
+ vand 22, 22, 25 # a2
+ vand 16, 18, 25
+ vsld 12, 16, 13
+ vor 11, 11, 12
+ vand 11, 11, 25 # a2
+ vspltisb 13, 14
+ vsrd 23, 15, 13 # >> 14
+ vsrd 24, 23, 31 # >> 26, a4
+ vand 23, 23, 25 # a3
+ vsrd 12, 18, 13 # >> 14
+ vsrd 13, 12, 31 # >> 26, a4
+ vand 12, 12, 25 # a3
+
+ vaddudm 4, 4, 20
+ vaddudm 5, 5, 21
+ vaddudm 6, 6, 22
+ vaddudm 7, 7, 23
+ vaddudm 8, 8, 24
+
+ # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
+ vmrgow 4, 9, 4
+ vmrgow 5, 10, 5
+ vmrgow 6, 11, 6
+ vmrgow 7, 12, 7
+ vmrgow 8, 13, 8
+ vaddudm 8, 8, 19
+
+ addi 5, 5, -64 # len -= 64
+ addi 21, 21, 64 # offset += 64
+
+ bdnz loop_4blocks
+
+Skip_block_loop:
+ xxlor 58, 0, 0
+ xxlor 59, 1, 1
+ xxlor 60, 2, 2
+ xxlor 61, 3, 3
+ xxlor 62, 4, 4
+ xxlor 32, 5, 5
+ xxlor 33, 6, 6
+ xxlor 34, 7, 7
+ xxlor 35, 8, 8
+
+ # Multiply odd words and even words
+ mul_odd
+ mul_even
+
+ # Sum the products.
+ xxpermdi 41, 31, 46, 0
+ xxpermdi 42, 31, 47, 0
+ vaddudm 4, 14, 9
+ xxpermdi 36, 31, 36, 3
+ vaddudm 5, 15, 10
+ xxpermdi 37, 31, 37, 3
+ xxpermdi 43, 31, 48, 0
+ vaddudm 6, 16, 11
+ xxpermdi 38, 31, 38, 3
+ xxpermdi 44, 31, 49, 0
+ vaddudm 7, 17, 12
+ xxpermdi 39, 31, 39, 3
+ xxpermdi 45, 31, 50, 0
+ vaddudm 8, 18, 13
+ xxpermdi 40, 31, 40, 3
+
+ # carry reduction
+ vspltisb 9, 2
+ vsrd 10, 4, 31
+ vsrd 11, 7, 31
+ vand 7, 7, 25
+ vand 4, 4, 25
+ vaddudm 8, 8, 11
+ vsrd 12, 8, 31
+ vaddudm 5, 5, 10
+
+ vsrd 11, 5, 31
+ vand 8, 8, 25
+ vand 5, 5, 25
+ vaddudm 4, 4, 12
+ vsld 10, 12, 9
+ vaddudm 6, 6, 11
+
+ vsrd 13, 6, 31
+ vand 6, 6, 25
+ vaddudm 4, 4, 10
+ vsrd 10, 4, 31
+ vaddudm 7, 7, 13
+
+ vsrd 11, 7, 31
+ vand 7, 7, 25
+ vand 4, 4, 25
+ vaddudm 5, 5, 10
+ vsrd 10, 5, 31
+ vand 5, 5, 25
+ vaddudm 6, 6, 10
+ vaddudm 8, 8, 11
+
+ b do_final_update
+
+do_final_update:
+ # combine 26 bit limbs
+ # v4, v5, v6, v7 and v8 are 26 bit vectors
+ vsld 5, 5, 31
+ vor 20, 4, 5
+ vspltisb 11, 12
+ vsrd 12, 6, 11
+ vsld 6, 6, 31
+ vsld 6, 6, 31
+ vor 20, 20, 6
+ vspltisb 11, 14
+ vsld 7, 7, 11
+ vor 21, 7, 12
+ mfvsrld 16, 40 # save last 2 bytes
+ vsld 8, 8, 11
+ vsld 8, 8, 31
+ vor 21, 21, 8
+ mfvsrld 17, 52
+ mfvsrld 19, 53
+ srdi 16, 16, 24
+
+ std 17, 0(3)
+ std 19, 8(3)
+ stw 16, 16(3)
+
+Out_loop:
+ li 3, 0
+
+ RESTORE_REGS
+
+ blr
+
+Out_no_poly1305:
+ li 3, 0
+ blr
+SYM_FUNC_END(poly1305_p10le_4blocks)
+
+#
+# =======================================================================
+# The following functions implement Poly1305 using 64 x 64 bit multiplication.
+#
+SYM_FUNC_START_LOCAL(Poly1305_init_64)
+ # key clamp masks:
+ # r11 = 0x0FFFFFFC0FFFFFFF (r0), r12 = 0x0FFFFFFC0FFFFFFC (r1)
+ addis 10, 2, rmask@toc@ha
+ addi 10, 10, rmask@toc@l
+ ld 11, 0(10)
+ ld 12, 8(10)
+
+ # initialize
+ # load key from r3
+ ld 9, 24(3)
+ ld 10, 32(3)
+ and. 9, 9, 11 # clamp mask r0
+ and. 10, 10, 12 # clamp mask r1
+
+ srdi 21, 10, 2
+ add 19, 21, 10 # s1: r19 = r1 + (r1 >> 2) = (r1 >> 2) * 5
+
+ # setup r and s
+ li 25, 0
+ mtvsrdd 32+0, 9, 19 # r0, s1
+ mtvsrdd 32+1, 10, 9 # r1, r0
+ mtvsrdd 32+2, 19, 25 # s1
+ mtvsrdd 32+3, 9, 25 # r0
+
+ blr
+SYM_FUNC_END(Poly1305_init_64)
+
+# Poly1305_mult
+# v6 = (h0, h1), v8 = h2
+# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0
+#
+# Output: v7, v10, v11
+#
+SYM_FUNC_START_LOCAL(Poly1305_mult)
+ #
+ # d0 = h0 * r0 + h1 * s1
+ vmsumudm 7, 6, 0, 9 # h0 * r0, h1 * s1
+
+ # d1 = h0 * r1 + h1 * r0 + h2 * s1
+ vmsumudm 11, 6, 1, 9 # h0 * r1, h1 * r0
+ vmsumudm 10, 8, 2, 11 # d1 += h2 * s1
+
+ # d2 = h2 * r0
+ vmsumudm 11, 8, 3, 9 # d2 = h2 * r0
+ blr
+SYM_FUNC_END(Poly1305_mult)
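+
+# Illustrative C equivalent of Poly1305_mult (ad hoc names, with u128
+# standing for unsigned __int128):
+#
+#   u128 d0 = (u128)h0 * r0 + (u128)h1 * s1;
+#   u128 d1 = (u128)h0 * r1 + (u128)h1 * r0 + (u128)h2 * s1;
+#   u128 d2 = (u128)h2 * r0;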
+
+#
+# carry reduction
+# h %= p
+#
+# Input: v7, v10, v11
+# Output: r27, r28, r29
+#
+SYM_FUNC_START_LOCAL(Carry_reduction)
+ mfvsrld 27, 32+7
+ mfvsrld 28, 32+10
+ mfvsrld 29, 32+11
+ mfvsrd 20, 32+7 # h0.h
+ mfvsrd 21, 32+10 # h1.h
+
+ addc 28, 28, 20
+ adde 29, 29, 21
+ srdi 22, 29, 0x2
+ sldi 23, 22, 0x2
+ add 23, 23, 22 # (h2 & 3) * 5
+ addc 27, 27, 23 # h0
+ addze 28, 28 # h1
+ andi. 29, 29, 0x3 # h2
+ blr
+SYM_FUNC_END(Carry_reduction)
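+
+# Illustrative C equivalent of the reduction (ad hoc names):
+#
+#   u64 t = h2 >> 2;   /* bits at and above 2^130 */
+#   h2 &= 3;
+#   t += t << 2;       /* t *= 5, since 2^130 == 5 (mod p) */
+#   h0 += t;           /* addc/addze propagate the carry into h1 */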
+
+#
+# poly1305 multiplication
+# h *= r, h %= p
+# d0 = h0 * r0 + h1 * s1
+# d1 = h0 * r1 + h1 * r0 + h2 * s1
+# d2 = h2 * r0
+#
+#
+# unsigned int poly1305_64s(unsigned char *state, const byte *src, size_t len, int highbit)
+# - no highbit if final leftover block (highbit = 0)
+#
+SYM_FUNC_START(poly1305_64s)
+ cmpdi 5, 0
+ ble Out_no_poly1305_64
+
+ mflr 0
+ std 0, 16(1)
+ stdu 1,-400(1)
+
+ SAVE_GPR 14, 112, 1
+ SAVE_GPR 15, 120, 1
+ SAVE_GPR 16, 128, 1
+ SAVE_GPR 17, 136, 1
+ SAVE_GPR 18, 144, 1
+ SAVE_GPR 19, 152, 1
+ SAVE_GPR 20, 160, 1
+ SAVE_GPR 21, 168, 1
+ SAVE_GPR 22, 176, 1
+ SAVE_GPR 23, 184, 1
+ SAVE_GPR 24, 192, 1
+ SAVE_GPR 25, 200, 1
+ SAVE_GPR 26, 208, 1
+ SAVE_GPR 27, 216, 1
+ SAVE_GPR 28, 224, 1
+ SAVE_GPR 29, 232, 1
+ SAVE_GPR 30, 240, 1
+ SAVE_GPR 31, 248, 1
+
+ # Init poly1305
+ bl Poly1305_init_64
+
+ li 25, 0 # offset to inp
+
+ add 11, 25, 4
+
+ # load accumulator h
+ # h0, h1, h2
+ ld 27, 0(3)
+ ld 28, 8(3)
+ lwz 29, 16(3)
+
+ li 30, 16
+ divdu 31, 5, 30
+
+ mtctr 31
+
+ mr 24, 6 # highbit
+
+Loop_block_64:
+ vxor 9, 9, 9
+
+ ld 20, 0(11)
+ ld 21, 8(11)
+ addi 11, 11, 16
+
+ addc 27, 27, 20
+ adde 28, 28, 21
+ adde 29, 29, 24
+
+ li 22, 0
+ mtvsrdd 32+6, 27, 28 # h0, h1
+ mtvsrdd 32+8, 29, 22 # h2
+
+ bl Poly1305_mult
+
+ bl Carry_reduction
+
+ bdnz Loop_block_64
+
+ std 27, 0(3)
+ std 28, 8(3)
+ stw 29, 16(3)
+
+ li 3, 0
+
+ RESTORE_GPR 14, 112, 1
+ RESTORE_GPR 15, 120, 1
+ RESTORE_GPR 16, 128, 1
+ RESTORE_GPR 17, 136, 1
+ RESTORE_GPR 18, 144, 1
+ RESTORE_GPR 19, 152, 1
+ RESTORE_GPR 20, 160, 1
+ RESTORE_GPR 21, 168, 1
+ RESTORE_GPR 22, 176, 1
+ RESTORE_GPR 23, 184, 1
+ RESTORE_GPR 24, 192, 1
+ RESTORE_GPR 25, 200, 1
+ RESTORE_GPR 26, 208, 1
+ RESTORE_GPR 27, 216, 1
+ RESTORE_GPR 28, 224, 1
+ RESTORE_GPR 29, 232, 1
+ RESTORE_GPR 30, 240, 1
+ RESTORE_GPR 31, 248, 1
+
+ addi 1, 1, 400
+ ld 0, 16(1)
+ mtlr 0
+
+ blr
+
+Out_no_poly1305_64:
+ li 3, 0
+ blr
+SYM_FUNC_END(poly1305_64s)
+
+#
+# Input: r3 = h, r4 = s, r5 = mac
+# mac = h + s
+#
+SYM_FUNC_START(poly1305_emit_64)
+ ld 10, 0(3)
+ ld 11, 8(3)
+ ld 12, 16(3)
+
+ # compare modulus
+ # h + 5 + (-p)
+ mr 6, 10
+ mr 7, 11
+ mr 8, 12
+ addic. 6, 6, 5
+ addze 7, 7
+ addze 8, 8
+ srdi 9, 8, 2 # overflow?
+ cmpdi 9, 0
+ beq Skip_h64
+ mr 10, 6
+ mr 11, 7
+ mr 12, 8
+
+Skip_h64:
+ ld 6, 0(4)
+ ld 7, 8(4)
+ addc 10, 10, 6
+ adde 11, 11, 7
+ addze 12, 12
+
+ std 10, 0(5)
+ std 11, 8(5)
+ blr
+SYM_FUNC_END(poly1305_emit_64)
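+
+# Illustrative C sketch of the final reduction above (ad hoc names):
+# h + 5 is formed with carries; if bit 130 is set, then h >= p and the
+# low 130 bits of h + 5 already equal h - p:
+#
+#   t = h + 5;          /* 130-bit add      */
+#   if (t >> 130)
+#       h = t;          /* select h - p     */
+#   mac = h + s;        /* taken mod 2^128  */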
+
+SYM_DATA_START_LOCAL(RMASK)
+.align 5
+rmask:
+.byte 0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
+cnum:
+.long 0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000 # 26-bit limb mask
+.long 0x1a, 0x00, 0x1a, 0x00 # limb shift amount (26)
+.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 # 1 << 24 (2^128 high bit per block)
+.long 0x00010203, 0x04050607, 0x10111213, 0x14151617 # vperm control: low halves
+.long 0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f # vperm control: high halves
+SYM_DATA_END(RMASK)
diff --git a/lib/crypto/powerpc/poly1305.h b/lib/crypto/powerpc/poly1305.h
new file mode 100644
index 000000000000..b8ed098a0e95
--- /dev/null
+++ b/lib/crypto/powerpc/poly1305.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Poly1305 authenticator algorithm, RFC7539.
+ *
+ * Copyright 2023- IBM Corp. All rights reserved.
+ */
+#include <asm/switch_to.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/unaligned.h>
+
+asmlinkage void poly1305_p10le_4blocks(struct poly1305_block_state *state, const u8 *m, u32 mlen);
+asmlinkage void poly1305_64s(struct poly1305_block_state *state, const u8 *m, u32 mlen, int highbit);
+asmlinkage void poly1305_emit_64(const struct poly1305_state *state, const u32 nonce[4], u8 digest[POLY1305_DIGEST_SIZE]);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10);
+
+static void vsx_begin(void)
+{
+ preempt_disable();
+ enable_kernel_vsx();
+}
+
+static void vsx_end(void)
+{
+ disable_kernel_vsx();
+ preempt_enable();
+}
+
+static void poly1305_block_init(struct poly1305_block_state *dctx,
+ const u8 raw_key[POLY1305_BLOCK_SIZE])
+{
+ if (!static_key_enabled(&have_p10))
+ return poly1305_block_init_generic(dctx, raw_key);
+
+ dctx->h = (struct poly1305_state){};
+ dctx->core_r.key.r64[0] = get_unaligned_le64(raw_key + 0);
+ dctx->core_r.key.r64[1] = get_unaligned_le64(raw_key + 8);
+}
+
+static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src,
+ unsigned int len, u32 padbit)
+{
+ if (!static_key_enabled(&have_p10))
+ return poly1305_blocks_generic(state, src, len, padbit);
+ vsx_begin();
+ if (len >= POLY1305_BLOCK_SIZE * 4) {
+ poly1305_p10le_4blocks(state, src, len);
+ src += len - (len % (POLY1305_BLOCK_SIZE * 4));
+ len %= POLY1305_BLOCK_SIZE * 4;
+ }
+ while (len >= POLY1305_BLOCK_SIZE) {
+ poly1305_64s(state, src, POLY1305_BLOCK_SIZE, padbit);
+ len -= POLY1305_BLOCK_SIZE;
+ src += POLY1305_BLOCK_SIZE;
+ }
+ vsx_end();
+}
+
+static void poly1305_emit(const struct poly1305_state *state,
+ u8 digest[POLY1305_DIGEST_SIZE], const u32 nonce[4])
+{
+ if (!static_key_enabled(&have_p10))
+ return poly1305_emit_generic(state, digest, nonce);
+ poly1305_emit_64(state, nonce, digest);
+}
+
+#define poly1305_mod_init_arch poly1305_mod_init_arch
+static void poly1305_mod_init_arch(void)
+{
+ if (cpu_has_feature(CPU_FTR_ARCH_31))
+ static_branch_enable(&have_p10);
+}
diff --git a/lib/crypto/powerpc/sha1-powerpc-asm.S b/lib/crypto/powerpc/sha1-powerpc-asm.S
new file mode 100644
index 000000000000..f0d5ed557ab1
--- /dev/null
+++ b/lib/crypto/powerpc/sha1-powerpc-asm.S
@@ -0,0 +1,188 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * SHA-1 implementation for PowerPC.
+ *
+ * Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+
+#ifdef __BIG_ENDIAN__
+#define LWZ(rt, d, ra) \
+ lwz rt,d(ra)
+#else
+#define LWZ(rt, d, ra) \
+ li rt,d; \
+ lwbrx rt,rt,ra
+#endif
+
+/*
+ * We roll the registers for T, A, B, C, D, E around on each
+ * iteration; T on iteration t is A on iteration t+1, and so on.
+ * We use registers 7 - 12 for this.
+ */
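+
+/*
+ * Illustrative C view of one round under this renaming (ad hoc names):
+ *
+ *   T = rol32(A, 5) + F(B, C, D) + E + K + W[t];
+ *   B = rol32(B, 30);
+ *
+ * and (A, B, C, D, E) <- (T, A, B, C, D) then happens purely through
+ * the RT()/RA()/... macros below, with no data movement.
+ */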
+#define RT(t) ((((t)+5)%6)+7)
+#define RA(t) ((((t)+4)%6)+7)
+#define RB(t) ((((t)+3)%6)+7)
+#define RC(t) ((((t)+2)%6)+7)
+#define RD(t) ((((t)+1)%6)+7)
+#define RE(t) ((((t)+0)%6)+7)
+
+/* We use registers 16 - 31 for the W values */
+#define W(t) (((t)%16)+16)
+
+#define LOADW(t) \
+ LWZ(W(t),(t)*4,r4)
+
+#define STEPD0_LOAD(t) \
+ andc r0,RD(t),RB(t); \
+ and r6,RB(t),RC(t); \
+ rotlwi RT(t),RA(t),5; \
+ or r6,r6,r0; \
+ add r0,RE(t),r15; \
+ add RT(t),RT(t),r6; \
+ add r14,r0,W(t); \
+ LWZ(W((t)+4),((t)+4)*4,r4); \
+ rotlwi RB(t),RB(t),30; \
+ add RT(t),RT(t),r14
+
+#define STEPD0_UPDATE(t) \
+ and r6,RB(t),RC(t); \
+ andc r0,RD(t),RB(t); \
+ rotlwi RT(t),RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ or r6,r6,r0; \
+ add r0,RE(t),r15; \
+ xor r5,W((t)+4-3),W((t)+4-8); \
+ add RT(t),RT(t),r6; \
+ xor W((t)+4),W((t)+4-16),W((t)+4-14); \
+ add r0,r0,W(t); \
+ xor W((t)+4),W((t)+4),r5; \
+ add RT(t),RT(t),r0; \
+ rotlwi W((t)+4),W((t)+4),1
+
+#define STEPD1(t) \
+ xor r6,RB(t),RC(t); \
+ rotlwi RT(t),RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ xor r6,r6,RD(t); \
+ add r0,RE(t),r15; \
+ add RT(t),RT(t),r6; \
+ add r0,r0,W(t); \
+ add RT(t),RT(t),r0
+
+#define STEPD1_UPDATE(t) \
+ xor r6,RB(t),RC(t); \
+ rotlwi RT(t),RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ xor r6,r6,RD(t); \
+ add r0,RE(t),r15; \
+ xor r5,W((t)+4-3),W((t)+4-8); \
+ add RT(t),RT(t),r6; \
+ xor W((t)+4),W((t)+4-16),W((t)+4-14); \
+ add r0,r0,W(t); \
+ xor W((t)+4),W((t)+4),r5; \
+ add RT(t),RT(t),r0; \
+ rotlwi W((t)+4),W((t)+4),1
+
+#define STEPD2_UPDATE(t) \
+ and r6,RB(t),RC(t); \
+ and r0,RB(t),RD(t); \
+ rotlwi RT(t),RA(t),5; \
+ or r6,r6,r0; \
+ rotlwi RB(t),RB(t),30; \
+ and r0,RC(t),RD(t); \
+ xor r5,W((t)+4-3),W((t)+4-8); \
+ or r6,r6,r0; \
+ xor W((t)+4),W((t)+4-16),W((t)+4-14); \
+ add r0,RE(t),r15; \
+ add RT(t),RT(t),r6; \
+ add r0,r0,W(t); \
+ xor W((t)+4),W((t)+4),r5; \
+ add RT(t),RT(t),r0; \
+ rotlwi W((t)+4),W((t)+4),1
+
+#define STEP0LD4(t) \
+ STEPD0_LOAD(t); \
+ STEPD0_LOAD((t)+1); \
+ STEPD0_LOAD((t)+2); \
+ STEPD0_LOAD((t)+3)
+
+#define STEPUP4(t, fn) \
+ STEP##fn##_UPDATE(t); \
+ STEP##fn##_UPDATE((t)+1); \
+ STEP##fn##_UPDATE((t)+2); \
+ STEP##fn##_UPDATE((t)+3)
+
+#define STEPUP20(t, fn) \
+ STEPUP4(t, fn); \
+ STEPUP4((t)+4, fn); \
+ STEPUP4((t)+8, fn); \
+ STEPUP4((t)+12, fn); \
+ STEPUP4((t)+16, fn)
+
+_GLOBAL(powerpc_sha_transform)
+ PPC_STLU r1,-INT_FRAME_SIZE(r1)
+ SAVE_GPRS(14, 31, r1)
+
+ /* Load up A - E */
+ lwz RA(0),0(r3) /* A */
+ lwz RB(0),4(r3) /* B */
+ lwz RC(0),8(r3) /* C */
+ lwz RD(0),12(r3) /* D */
+ lwz RE(0),16(r3) /* E */
+
+ LOADW(0)
+ LOADW(1)
+ LOADW(2)
+ LOADW(3)
+
+ lis r15,0x5a82 /* K0-19 */
+ ori r15,r15,0x7999
+ STEP0LD4(0)
+ STEP0LD4(4)
+ STEP0LD4(8)
+ STEPUP4(12, D0)
+ STEPUP4(16, D0)
+
+ lis r15,0x6ed9 /* K20-39 */
+ ori r15,r15,0xeba1
+ STEPUP20(20, D1)
+
+ lis r15,0x8f1b /* K40-59 */
+ ori r15,r15,0xbcdc
+ STEPUP20(40, D2)
+
+ lis r15,0xca62 /* K60-79 */
+ ori r15,r15,0xc1d6
+ STEPUP4(60, D1)
+ STEPUP4(64, D1)
+ STEPUP4(68, D1)
+ STEPUP4(72, D1)
+ lwz r20,16(r3)
+ STEPD1(76)
+ lwz r19,12(r3)
+ STEPD1(77)
+ lwz r18,8(r3)
+ STEPD1(78)
+ lwz r17,4(r3)
+ STEPD1(79)
+
+ lwz r16,0(r3)
+ add r20,RE(80),r20
+ add RD(0),RD(80),r19
+ add RC(0),RC(80),r18
+ add RB(0),RB(80),r17
+ add RA(0),RA(80),r16
+ mr RE(0),r20
+ stw RA(0),0(r3)
+ stw RB(0),4(r3)
+ stw RC(0),8(r3)
+ stw RD(0),12(r3)
+ stw RE(0),16(r3)
+
+ REST_GPRS(14, 31, r1)
+ addi r1,r1,INT_FRAME_SIZE
+ blr
diff --git a/lib/crypto/powerpc/sha1-spe-asm.S b/lib/crypto/powerpc/sha1-spe-asm.S
new file mode 100644
index 000000000000..0f447523be5e
--- /dev/null
+++ b/lib/crypto/powerpc/sha1-spe-asm.S
@@ -0,0 +1,294 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Fast SHA-1 implementation for SPE instruction set (PPC)
+ *
+ * This code makes use of the SPE SIMD instruction set as defined in
+ * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
+ * Implementation is based on optimization guide notes from
+ * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+#define rHP r3 /* pointer to hash value */
+#define rWP r4 /* pointer to input */
+#define rKP r5 /* pointer to constants */
+
+#define rW0 r14 /* 64 bit round words */
+#define rW1 r15
+#define rW2 r16
+#define rW3 r17
+#define rW4 r18
+#define rW5 r19
+#define rW6 r20
+#define rW7 r21
+
+#define rH0 r6 /* 32 bit hash values */
+#define rH1 r7
+#define rH2 r8
+#define rH3 r9
+#define rH4 r10
+
+#define rT0 r22 /* 64 bit temporary */
+#define rT1 r0 /* 32 bit temporaries */
+#define rT2 r11
+#define rT3 r12
+
+#define rK r23 /* 64 bit constant in volatile register */
+
+#define LOAD_K01
+
+#define LOAD_K11 \
+ evlwwsplat rK,0(rKP);
+
+#define LOAD_K21 \
+ evlwwsplat rK,4(rKP);
+
+#define LOAD_K31 \
+ evlwwsplat rK,8(rKP);
+
+#define LOAD_K41 \
+ evlwwsplat rK,12(rKP);
+
+#define INITIALIZE \
+ stwu r1,-128(r1); /* create stack frame */ \
+ evstdw r14,8(r1); /* We must save non volatile */ \
+ evstdw r15,16(r1); /* registers. Take the chance */ \
+ evstdw r16,24(r1); /* and save the SPE part too */ \
+ evstdw r17,32(r1); \
+ evstdw r18,40(r1); \
+ evstdw r19,48(r1); \
+ evstdw r20,56(r1); \
+ evstdw r21,64(r1); \
+ evstdw r22,72(r1); \
+ evstdw r23,80(r1);
+
+
+#define FINALIZE \
+ evldw r14,8(r1); /* restore SPE registers */ \
+ evldw r15,16(r1); \
+ evldw r16,24(r1); \
+ evldw r17,32(r1); \
+ evldw r18,40(r1); \
+ evldw r19,48(r1); \
+ evldw r20,56(r1); \
+ evldw r21,64(r1); \
+ evldw r22,72(r1); \
+ evldw r23,80(r1); \
+ xor r0,r0,r0; \
+ stw r0,8(r1); /* Delete sensitive data */ \
+ stw r0,16(r1); /* that we might have pushed */ \
+ stw r0,24(r1); /* from other context that runs */ \
+ stw r0,32(r1); /* the same code. Assume that */ \
+ stw r0,40(r1); /* the lower part of the GPRs */ \
+ stw r0,48(r1); /* was already overwritten on */ \
+ stw r0,56(r1); /* the way down to here */ \
+ stw r0,64(r1); \
+ stw r0,72(r1); \
+ stw r0,80(r1); \
+ addi r1,r1,128; /* cleanup stack frame */
+
+#ifdef __BIG_ENDIAN__
+#define LOAD_DATA(reg, off) \
+ lwz reg,off(rWP); /* load data */
+#define NEXT_BLOCK \
+ addi rWP,rWP,64; /* increment per block */
+#else
+#define LOAD_DATA(reg, off) \
+ lwbrx reg,0,rWP; /* load data */ \
+ addi rWP,rWP,4; /* increment per word */
+#define NEXT_BLOCK /* nothing to do */
+#endif
+
+#define R_00_15(a, b, c, d, e, w0, w1, k, off) \
+ LOAD_DATA(w0, off) /* 1: W */ \
+ and rT2,b,c; /* 1: F' = B and C */ \
+ LOAD_K##k##1 \
+ andc rT1,d,b; /* 1: F" = ~B and D */ \
+ rotrwi rT0,a,27; /* 1: A' = A rotl 5 */ \
+ or rT2,rT2,rT1; /* 1: F = F' or F" */ \
+ add e,e,rT0; /* 1: E = E + A' */ \
+ rotrwi b,b,2; /* 1: B = B rotl 30 */ \
+ add e,e,w0; /* 1: E = E + W */ \
+ LOAD_DATA(w1, off+4) /* 2: W */ \
+ add e,e,rT2; /* 1: E = E + F */ \
+ and rT1,a,b; /* 2: F' = B and C */ \
+ add e,e,rK; /* 1: E = E + K */ \
+ andc rT2,c,a; /* 2: F" = ~B and D */ \
+ add d,d,rK; /* 2: E = E + K */ \
+ or rT2,rT2,rT1; /* 2: F = F' or F" */ \
+ rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
+ add d,d,w1; /* 2: E = E + W */ \
+ rotrwi a,a,2; /* 2: B = B rotl 30 */ \
+ add d,d,rT0; /* 2: E = E + A' */ \
+ evmergelo w1,w1,w0; /* mix W[0]/W[1] */ \
+ add d,d,rT2 /* 2: E = E + F */
+
+#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
+ and rT2,b,c; /* 1: F' = B and C */ \
+ evmergelohi rT0,w7,w6; /* W[-3] */ \
+ andc rT1,d,b; /* 1: F" = ~B and D */ \
+ evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
+ or rT1,rT1,rT2; /* 1: F = F' or F" */ \
+ evxor w0,w0,w4; /* W = W xor W[-8] */ \
+ add e,e,rT1; /* 1: E = E + F */ \
+ evxor w0,w0,w1; /* W = W xor W[-14] */ \
+ rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
+ evrlwi w0,w0,1; /* W = W rotl 1 */ \
+ add e,e,rT2; /* 1: E = E + A' */ \
+ evaddw rT0,w0,rK; /* WK = W + K */ \
+ rotrwi b,b,2; /* 1: B = B rotl 30 */ \
+ LOAD_K##k##1 \
+ evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
+ add e,e,rT0; /* 1: E = E + WK */ \
+ add d,d,rT1; /* 2: E = E + WK */ \
+ and rT2,a,b; /* 2: F' = B and C */ \
+ andc rT1,c,a; /* 2: F" = ~B and D */ \
+ rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
+ or rT1,rT1,rT2; /* 2: F = F' or F" */ \
+ add d,d,rT0; /* 2: E = E + A' */ \
+ rotrwi a,a,2; /* 2: B = B rotl 30 */ \
+ add d,d,rT1 /* 2: E = E + F */
+
+#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
+ evmergelohi rT0,w7,w6; /* W[-3] */ \
+ xor rT2,b,c; /* 1: F' = B xor C */ \
+ evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
+ xor rT2,rT2,d; /* 1: F = F' xor D */ \
+ evxor w0,w0,w4; /* W = W xor W[-8] */ \
+ add e,e,rT2; /* 1: E = E + F */ \
+ evxor w0,w0,w1; /* W = W xor W[-14] */ \
+ rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
+ evrlwi w0,w0,1; /* W = W rotl 1 */ \
+ add e,e,rT2; /* 1: E = E + A' */ \
+ evaddw rT0,w0,rK; /* WK = W + K */ \
+ rotrwi b,b,2; /* 1: B = B rotl 30 */ \
+ LOAD_K##k##1 \
+ evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
+ add e,e,rT0; /* 1: E = E + WK */ \
+ xor rT2,a,b; /* 2: F' = B xor C */ \
+ add d,d,rT1; /* 2: E = E + WK */ \
+ xor rT2,rT2,c; /* 2: F = F' xor D */ \
+ rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
+ add d,d,rT2; /* 2: E = E + F */ \
+ rotrwi a,a,2; /* 2: B = B rotl 30 */ \
+ add d,d,rT0 /* 2: E = E + A' */
+
+#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
+ and rT2,b,c; /* 1: F' = B and C */ \
+ evmergelohi rT0,w7,w6; /* W[-3] */ \
+ or rT1,b,c; /* 1: F" = B or C */ \
+ evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
+ and rT1,d,rT1; /* 1: F" = F" and D */ \
+ evxor w0,w0,w4; /* W = W xor W[-8] */ \
+ or rT2,rT2,rT1; /* 1: F = F' or F" */ \
+ evxor w0,w0,w1; /* W = W xor W[-14] */ \
+ add e,e,rT2; /* 1: E = E + F */ \
+ evrlwi w0,w0,1; /* W = W rotl 1 */ \
+ rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
+ evaddw rT0,w0,rK; /* WK = W + K */ \
+ add e,e,rT2; /* 1: E = E + A' */ \
+ LOAD_K##k##1 \
+ evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
+ rotrwi b,b,2; /* 1: B = B rotl 30 */ \
+ add e,e,rT0; /* 1: E = E + WK */ \
+ and rT2,a,b; /* 2: F' = B and C */ \
+ or rT0,a,b; /* 2: F" = B or C */ \
+ add d,d,rT1; /* 2: E = E + WK */ \
+ and rT0,c,rT0; /* 2: F" = F" and D */ \
+ rotrwi a,a,2; /* 2: B = B rotl 30 */ \
+ or rT2,rT2,rT0; /* 2: F = F' or F" */ \
+ rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
+ add d,d,rT2; /* 2: E = E + F */ \
+ add d,d,rT0 /* 2: E = E + A' */
+
+#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
+ R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)
+
+_GLOBAL(ppc_spe_sha1_transform)
+ INITIALIZE
+
+ lwz rH0,0(rHP)
+ lwz rH1,4(rHP)
+ mtctr r5
+ lwz rH2,8(rHP)
+ lis rKP,PPC_SPE_SHA1_K@h
+ lwz rH3,12(rHP)
+ ori rKP,rKP,PPC_SPE_SHA1_K@l
+ lwz rH4,16(rHP)
+
+ppc_spe_sha1_main:
+ R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
+ R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
+ R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
+ R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
+ R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
+ R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
+ R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
+ R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)
+
+ R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
+ R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)
+
+ R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
+ R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
+ R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
+ R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
+ R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
+ R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
+ R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
+ R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
+ R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
+ R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)
+
+ R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
+ R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
+ R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
+ R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
+ R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
+ R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
+ R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
+ R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
+ R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
+ R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)
+
+ R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
+ R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
+ R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
+ R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
+ R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
+ R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
+ R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
+ lwz rT3,0(rHP)
+ R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
+ lwz rW1,4(rHP)
+ R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
+ lwz rW2,8(rHP)
+ R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
+ lwz rW3,12(rHP)
+ NEXT_BLOCK
+ lwz rW4,16(rHP)
+
+ add rH0,rH0,rT3
+ stw rH0,0(rHP)
+ add rH1,rH1,rW1
+ stw rH1,4(rHP)
+ add rH2,rH2,rW2
+ stw rH2,8(rHP)
+ add rH3,rH3,rW3
+ stw rH3,12(rHP)
+ add rH4,rH4,rW4
+ stw rH4,16(rHP)
+
+ bdnz ppc_spe_sha1_main
+
+ FINALIZE
+ blr
+
+.data
+.align 4
+PPC_SPE_SHA1_K:
+ .long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6
diff --git a/lib/crypto/powerpc/sha1.h b/lib/crypto/powerpc/sha1.h
new file mode 100644
index 000000000000..e2c010f0370b
--- /dev/null
+++ b/lib/crypto/powerpc/sha1.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-1 optimized for PowerPC
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+
+#include <asm/switch_to.h>
+#include <linux/preempt.h>
+
+#ifdef CONFIG_SPE
+/*
+ * MAX_BYTES defines the number of bytes that are allowed to be processed
+ * between preempt_disable() and preempt_enable(). SHA1 takes ~1000
+ * operations per 64 bytes. e500 cores can issue two arithmetic instructions
+ * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2).
+ * Thus 2KB of input data will need an estimated maximum of 18,000 cycles.
+ * Headroom for cache misses included. Even with the low end model clocked
+ * at 667 MHz this equals a critical time window of less than 27us.
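+ * (Arithmetic: 2048 / 64 = 32 blocks x ~1,000 ops, issued two per
+ * cycle, is ~16,000 cycles; 18,000 allows for the headroom.)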
+ *
+ */
+#define MAX_BYTES 2048
+
+asmlinkage void ppc_spe_sha1_transform(struct sha1_block_state *state,
+ const u8 *data, u32 nblocks);
+
+static void spe_begin(void)
+{
+ /* We just start SPE operations and will save SPE registers later. */
+ preempt_disable();
+ enable_kernel_spe();
+}
+
+static void spe_end(void)
+{
+ disable_kernel_spe();
+ /* reenable preemption */
+ preempt_enable();
+}
+
+static void sha1_blocks(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ do {
+ u32 unit = min_t(size_t, nblocks, MAX_BYTES / SHA1_BLOCK_SIZE);
+
+ spe_begin();
+ ppc_spe_sha1_transform(state, data, unit);
+ spe_end();
+
+ data += unit * SHA1_BLOCK_SIZE;
+ nblocks -= unit;
+ } while (nblocks);
+}
+#else /* CONFIG_SPE */
+asmlinkage void powerpc_sha_transform(struct sha1_block_state *state,
+ const u8 data[SHA1_BLOCK_SIZE]);
+
+static void sha1_blocks(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ do {
+ powerpc_sha_transform(state, data);
+ data += SHA1_BLOCK_SIZE;
+ } while (--nblocks);
+}
+#endif /* !CONFIG_SPE */
diff --git a/lib/crypto/powerpc/sha256-spe-asm.S b/lib/crypto/powerpc/sha256-spe-asm.S
new file mode 100644
index 000000000000..cd99d71dae34
--- /dev/null
+++ b/lib/crypto/powerpc/sha256-spe-asm.S
@@ -0,0 +1,318 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Fast SHA-256 implementation for SPE instruction set (PPC)
+ *
+ * This code makes use of the SPE SIMD instruction set as defined in
+ * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
+ * Implementation is based on optimization guide notes from
+ * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+#define rHP r3 /* pointer to hash values in memory */
+#define rKP r24 /* pointer to round constants */
+#define rWP r4 /* pointer to input data */
+
+#define rH0 r5 /* 8 32 bit hash values in 8 registers */
+#define rH1 r6
+#define rH2 r7
+#define rH3 r8
+#define rH4 r9
+#define rH5 r10
+#define rH6 r11
+#define rH7 r12
+
+#define rW0 r14 /* 64 bit registers. 16 words in 8 registers */
+#define rW1 r15
+#define rW2 r16
+#define rW3 r17
+#define rW4 r18
+#define rW5 r19
+#define rW6 r20
+#define rW7 r21
+
+#define rT0 r22 /* 64 bit temporaries */
+#define rT1 r23
+#define rT2 r0 /* 32 bit temporaries */
+#define rT3 r25
+
+#define CMP_KN_LOOP
+#define CMP_KC_LOOP \
+ cmpwi rT1,0;
+
+#define INITIALIZE \
+ stwu r1,-128(r1); /* create stack frame */ \
+ evstdw r14,8(r1); /* We must save non volatile */ \
+ evstdw r15,16(r1); /* registers. Take the chance */ \
+ evstdw r16,24(r1); /* and save the SPE part too */ \
+ evstdw r17,32(r1); \
+ evstdw r18,40(r1); \
+ evstdw r19,48(r1); \
+ evstdw r20,56(r1); \
+ evstdw r21,64(r1); \
+ evstdw r22,72(r1); \
+ evstdw r23,80(r1); \
+ stw r24,88(r1); /* save normal registers */ \
+ stw r25,92(r1);
+
+
+#define FINALIZE \
+ evldw r14,8(r1); /* restore SPE registers */ \
+ evldw r15,16(r1); \
+ evldw r16,24(r1); \
+ evldw r17,32(r1); \
+ evldw r18,40(r1); \
+ evldw r19,48(r1); \
+ evldw r20,56(r1); \
+ evldw r21,64(r1); \
+ evldw r22,72(r1); \
+ evldw r23,80(r1); \
+ lwz r24,88(r1); /* restore normal registers */ \
+ lwz r25,92(r1); \
+ xor r0,r0,r0; \
+ stw r0,8(r1); /* Delete sensitive data */ \
+ stw r0,16(r1); /* that we might have pushed */ \
+ stw r0,24(r1); /* from other context that runs */ \
+ stw r0,32(r1); /* the same code. Assume that */ \
+ stw r0,40(r1); /* the lower part of the GPRs */ \
+ stw r0,48(r1); /* was already overwritten on */ \
+ stw r0,56(r1); /* the way down to here */ \
+ stw r0,64(r1); \
+ stw r0,72(r1); \
+ stw r0,80(r1); \
+ addi r1,r1,128; /* cleanup stack frame */
+
+#ifdef __BIG_ENDIAN__
+#define LOAD_DATA(reg, off) \
+ lwz reg,off(rWP); /* load data */
+#define NEXT_BLOCK \
+ addi rWP,rWP,64; /* increment per block */
+#else
+#define LOAD_DATA(reg, off) \
+ lwbrx reg,0,rWP; /* load data */ \
+ addi rWP,rWP,4; /* increment per word */
+#define NEXT_BLOCK /* nothing to do */
+#endif
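+
+/*
+ * Illustrative C form of one SHA-256 round as computed by the macros
+ * below (ad hoc names; each macro interleaves two such rounds):
+ *
+ *   S1 = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
+ *   ch = (e & f) ^ (~e & g);
+ *   temp1 = h + S1 + ch + K[t] + W[t];
+ *   S0 = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
+ *   maj = ((a | b) & c) | (a & b);
+ *   temp2 = S0 + maj;
+ *   d += temp1;
+ *   h = temp1 + temp2;
+ */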
+
+#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \
+ LOAD_DATA(w, off) /* 1: W */ \
+ rotrwi rT0,e,6; /* 1: S1 = e rotr 6 */ \
+ rotrwi rT1,e,11; /* 1: S1' = e rotr 11 */ \
+ rotrwi rT2,e,25; /* 1: S1" = e rotr 25 */ \
+ xor rT0,rT0,rT1; /* 1: S1 = S1 xor S1' */ \
+ and rT3,e,f; /* 1: ch = e and f */ \
+ xor rT0,rT0,rT2; /* 1: S1 = S1 xor S1" */ \
+ andc rT1,g,e; /* 1: ch' = ~e and g */ \
+ lwz rT2,off(rKP); /* 1: K */ \
+ xor rT3,rT3,rT1; /* 1: ch = ch xor ch' */ \
+ add h,h,rT0; /* 1: temp1 = h + S1 */ \
+ add rT3,rT3,w; /* 1: temp1' = ch + w */ \
+ rotrwi rT0,a,2; /* 1: S0 = a rotr 2 */ \
+ add h,h,rT3; /* 1: temp1 = temp1 + temp1' */ \
+ rotrwi rT1,a,13; /* 1: S0' = a rotr 13 */ \
+ add h,h,rT2; /* 1: temp1 = temp1 + K */ \
+ rotrwi rT3,a,22; /* 1: S0" = a rotr 22 */ \
+ xor rT0,rT0,rT1; /* 1: S0 = S0 xor S0' */ \
+ add d,d,h; /* 1: d = d + temp1 */ \
+ xor rT3,rT0,rT3; /* 1: S0 = S0 xor S0" */ \
+ evmergelo w,w,w; /* shift W */ \
+ or rT2,a,b; /* 1: maj = a or b */ \
+ and rT1,a,b; /* 1: maj' = a and b */ \
+ and rT2,rT2,c; /* 1: maj = maj and c */ \
+ LOAD_DATA(w, off+4) /* 2: W */ \
+ or rT2,rT1,rT2; /* 1: maj = maj or maj' */ \
+ rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \
+ add rT3,rT3,rT2; /* 1: temp2 = S0 + maj */ \
+ rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \
+ add h,h,rT3; /* 1: h = temp1 + temp2 */ \
+ rotrwi rT2,d,25; /* 2: S1" = e rotr 25 */ \
+ xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \
+ and rT3,d,e; /* 2: ch = e and f */ \
+ xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \
+ andc rT1,f,d; /* 2: ch' = ~e and g */ \
+ lwz rT2,off+4(rKP); /* 2: K */ \
+ xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \
+ add g,g,rT0; /* 2: temp1 = h + S1 */ \
+ add rT3,rT3,w; /* 2: temp1' = ch + w */ \
+ rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \
+ add g,g,rT3; /* 2: temp1 = temp1 + temp1' */ \
+ rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \
+ add g,g,rT2; /* 2: temp1 = temp1 + K */ \
+ rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \
+ xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \
+ or rT2,h,a; /* 2: maj = a or b */ \
+ xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \
+ and rT1,h,a; /* 2: maj' = a and b */ \
+ and rT2,rT2,b; /* 2: maj = maj and c */ \
+ add c,c,g; /* 2: d = d + temp1 */ \
+ or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \
+ add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \
+ add g,g,rT3 /* 2: h = temp1 + temp2 */
+
+#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \
+ rotrwi rT2,e,6; /* 1: S1 = e rotr 6 */ \
+ evmergelohi rT0,w0,w1; /* w[-15] */ \
+ rotrwi rT3,e,11; /* 1: S1' = e rotr 11 */ \
+ evsrwiu rT1,rT0,3; /* s0 = w[-15] >> 3 */ \
+ xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \
+ evrlwi rT0,rT0,25; /* s0' = w[-15] rotr 7 */ \
+ rotrwi rT3,e,25; /* 1: S1' = e rotr 25 */ \
+ evxor rT1,rT1,rT0; /* s0 = s0 xor s0' */ \
+ xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \
+ evrlwi rT0,rT0,21; /* s0' = w[-15] rotr 18 */ \
+ add h,h,rT2; /* 1: temp1 = h + S1 */ \
+ evxor rT0,rT0,rT1; /* s0 = s0 xor s0' */ \
+ and rT2,e,f; /* 1: ch = e and f */ \
+ evaddw w0,w0,rT0; /* w = w[-16] + s0 */ \
+ andc rT3,g,e; /* 1: ch' = ~e and g */ \
+ evsrwiu rT0,w7,10; /* s1 = w[-2] >> 10 */ \
+ xor rT2,rT2,rT3; /* 1: ch = ch xor ch' */ \
+ evrlwi rT1,w7,15; /* s1' = w[-2] rotr 17 */ \
+ add h,h,rT2; /* 1: temp1 = temp1 + ch */ \
+ evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \
+ rotrwi rT2,a,2; /* 1: S0 = a rotr 2 */ \
+ evrlwi rT1,w7,13; /* s1' = w[-2] rotr 19 */ \
+ rotrwi rT3,a,13; /* 1: S0' = a rotr 13 */ \
+ evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \
+ xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \
+ evldw rT1,off(rKP); /* k */ \
+ rotrwi rT3,a,22; /* 1: S0' = a rotr 22 */ \
+ evaddw w0,w0,rT0; /* w = w + s1 */ \
+ xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \
+ evmergelohi rT0,w4,w5; /* w[-7] */ \
+ and rT3,a,b; /* 1: maj = a and b */ \
+ evaddw w0,w0,rT0; /* w = w + w[-7] */ \
+ CMP_K##k##_LOOP \
+ add rT2,rT2,rT3; /* 1: temp2 = S0 + maj */ \
+ evaddw rT1,rT1,w0; /* wk = w + k */ \
+ xor rT3,a,b; /* 1: maj = a xor b */ \
+ evmergehi rT0,rT1,rT1; /* wk1/wk2 */ \
+ and rT3,rT3,c; /* 1: maj = maj and c */ \
+ add h,h,rT0; /* 1: temp1 = temp1 + wk */ \
+ add rT2,rT2,rT3; /* 1: temp2 = temp2 + maj */ \
+ add g,g,rT1; /* 2: temp1 = temp1 + wk */ \
+ add d,d,h; /* 1: d = d + temp1 */ \
+ rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \
+ add h,h,rT2; /* 1: h = temp1 + temp2 */ \
+ rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \
+ rotrwi rT2,d,25; /* 2: S1" = e rotr 25 */ \
+ xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \
+ and rT3,d,e; /* 2: ch = e and f */ \
+ xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \
+ andc rT1,f,d; /* 2: ch' = ~e and g */ \
+ add g,g,rT0; /* 2: temp1 = h + S1 */ \
+ xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \
+ rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \
+ add g,g,rT3; /* 2: temp1 = temp1 + ch */ \
+ rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \
+ rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \
+ xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \
+ or rT2,h,a; /* 2: maj = a or b */ \
+ and rT1,h,a; /* 2: maj' = a and b */ \
+ and rT2,rT2,b; /* 2: maj = maj and c */ \
+ xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \
+ or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \
+ add c,c,g; /* 2: d = d + temp1 */ \
+ add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \
+ add g,g,rT3 /* 2: h = temp1 + temp2 */
+
+_GLOBAL(ppc_spe_sha256_transform)
+ INITIALIZE
+
+ mtctr r5
+ lwz rH0,0(rHP)
+ lwz rH1,4(rHP)
+ lwz rH2,8(rHP)
+ lwz rH3,12(rHP)
+ lwz rH4,16(rHP)
+ lwz rH5,20(rHP)
+ lwz rH6,24(rHP)
+ lwz rH7,28(rHP)
+
+ppc_spe_sha256_main:
+ lis rKP,PPC_SPE_SHA256_K@ha
+ addi rKP,rKP,PPC_SPE_SHA256_K@l
+
+ R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0)
+ R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8)
+ R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16)
+ R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24)
+ R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32)
+ R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40)
+ R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48)
+ R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56)
+ppc_spe_sha256_16_rounds:
+ addi rKP,rKP,64
+ R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
+ rW0, rW1, rW4, rW5, rW7, N, 0)
+ R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
+ rW1, rW2, rW5, rW6, rW0, N, 8)
+ R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
+ rW2, rW3, rW6, rW7, rW1, N, 16)
+ R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
+ rW3, rW4, rW7, rW0, rW2, N, 24)
+ R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
+ rW4, rW5, rW0, rW1, rW3, N, 32)
+ R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
+ rW5, rW6, rW1, rW2, rW4, N, 40)
+ R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
+ rW6, rW7, rW2, rW3, rW5, N, 48)
+ R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
+ rW7, rW0, rW3, rW4, rW6, C, 56)
+ bt gt,ppc_spe_sha256_16_rounds
+
+ lwz rW0,0(rHP)
+ NEXT_BLOCK
+ lwz rW1,4(rHP)
+ lwz rW2,8(rHP)
+ lwz rW3,12(rHP)
+ lwz rW4,16(rHP)
+ lwz rW5,20(rHP)
+ lwz rW6,24(rHP)
+ lwz rW7,28(rHP)
+
+ add rH0,rH0,rW0
+ stw rH0,0(rHP)
+ add rH1,rH1,rW1
+ stw rH1,4(rHP)
+ add rH2,rH2,rW2
+ stw rH2,8(rHP)
+ add rH3,rH3,rW3
+ stw rH3,12(rHP)
+ add rH4,rH4,rW4
+ stw rH4,16(rHP)
+ add rH5,rH5,rW5
+ stw rH5,20(rHP)
+ add rH6,rH6,rW6
+ stw rH6,24(rHP)
+ add rH7,rH7,rW7
+ stw rH7,28(rHP)
+
+ bdnz ppc_spe_sha256_main
+
+ FINALIZE
+ blr
+
+.data
+.align 5
+PPC_SPE_SHA256_K:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
diff --git a/lib/crypto/powerpc/sha256.h b/lib/crypto/powerpc/sha256.h
new file mode 100644
index 000000000000..50d355441c7e
--- /dev/null
+++ b/lib/crypto/powerpc/sha256.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 Secure Hash Algorithm, SPE optimized
+ *
+ * Based on the generic implementation. The assembler module takes care
+ * of the SPE registers so it can run from interrupt context.
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+
+#include <asm/switch_to.h>
+#include <linux/preempt.h>
+
+/*
+ * MAX_BYTES defines the number of bytes that are allowed to be processed
+ * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000
+ * operations per 64 bytes. e500 cores can issue two arithmetic instructions
+ * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2).
+ * Thus 1KB of input data will need an estimated maximum of 18,000 cycles.
+ * Headroom for cache misses included. Even with the low end model clocked
+ * at 667 MHz this equals a critical time window of less than 27us.
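+ * (Arithmetic: 1024 / 64 = 16 blocks x ~2,000 ops, issued two per
+ * cycle, is ~16,000 cycles; 18,000 allows for the headroom.)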
+ *
+ */
+#define MAX_BYTES 1024
+
+extern void ppc_spe_sha256_transform(struct sha256_block_state *state,
+ const u8 *src, u32 blocks);
+
+static void spe_begin(void)
+{
+ /* We just start SPE operations and will save SPE registers later. */
+ preempt_disable();
+ enable_kernel_spe();
+}
+
+static void spe_end(void)
+{
+ disable_kernel_spe();
+ /* reenable preemption */
+ preempt_enable();
+}
+
+static void sha256_blocks(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ do {
+ /* cut input data into smaller blocks */
+ u32 unit = min_t(size_t, nblocks,
+ MAX_BYTES / SHA256_BLOCK_SIZE);
+
+ spe_begin();
+ ppc_spe_sha256_transform(state, data, unit);
+ spe_end();
+
+ data += unit * SHA256_BLOCK_SIZE;
+ nblocks -= unit;
+ } while (nblocks);
+}
diff --git a/lib/crypto/riscv/chacha-riscv64-zvkb.S b/lib/crypto/riscv/chacha-riscv64-zvkb.S
new file mode 100644
index 000000000000..b777d0b4e379
--- /dev/null
+++ b/lib/crypto/riscv/chacha-riscv64-zvkb.S
@@ -0,0 +1,297 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// This file is dual-licensed, meaning that you can use it under your
+// choice of either of the following two licenses:
+//
+// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You can obtain
+// a copy in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// or
+//
+// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
+// Copyright 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The generated code of this file depends on the following RISC-V extensions:
+// - RV64I
+// - RISC-V Vector ('V') with VLEN >= 128
+// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
+
+#include <linux/linkage.h>
+
+.text
+.option arch, +zvkb
+
+#define STATEP a0
+#define INP a1
+#define OUTP a2
+#define NBLOCKS a3
+#define NROUNDS a4
+
+#define CONSTS0 a5
+#define CONSTS1 a6
+#define CONSTS2 a7
+#define CONSTS3 t0
+#define TMP t1
+#define VL t2
+#define STRIDE t3
+#define ROUND_CTR t4
+#define KEY0 s0
+#define KEY1 s1
+#define KEY2 s2
+#define KEY3 s3
+#define KEY4 s4
+#define KEY5 s5
+#define KEY6 s6
+#define KEY7 s7
+#define COUNTER s8
+#define NONCE0 s9
+#define NONCE1 s10
+#define NONCE2 s11
+
+.macro chacha_round a0, b0, c0, d0, a1, b1, c1, d1, \
+ a2, b2, c2, d2, a3, b3, c3, d3
+ // a += b; d ^= a; d = rol(d, 16);
+ vadd.vv \a0, \a0, \b0
+ vadd.vv \a1, \a1, \b1
+ vadd.vv \a2, \a2, \b2
+ vadd.vv \a3, \a3, \b3
+ vxor.vv \d0, \d0, \a0
+ vxor.vv \d1, \d1, \a1
+ vxor.vv \d2, \d2, \a2
+ vxor.vv \d3, \d3, \a3
+ vror.vi \d0, \d0, 32 - 16
+ vror.vi \d1, \d1, 32 - 16
+ vror.vi \d2, \d2, 32 - 16
+ vror.vi \d3, \d3, 32 - 16
+
+ // c += d; b ^= c; b = rol(b, 12);
+ vadd.vv \c0, \c0, \d0
+ vadd.vv \c1, \c1, \d1
+ vadd.vv \c2, \c2, \d2
+ vadd.vv \c3, \c3, \d3
+ vxor.vv \b0, \b0, \c0
+ vxor.vv \b1, \b1, \c1
+ vxor.vv \b2, \b2, \c2
+ vxor.vv \b3, \b3, \c3
+ vror.vi \b0, \b0, 32 - 12
+ vror.vi \b1, \b1, 32 - 12
+ vror.vi \b2, \b2, 32 - 12
+ vror.vi \b3, \b3, 32 - 12
+
+ // a += b; d ^= a; d = rol(d, 8);
+ vadd.vv \a0, \a0, \b0
+ vadd.vv \a1, \a1, \b1
+ vadd.vv \a2, \a2, \b2
+ vadd.vv \a3, \a3, \b3
+ vxor.vv \d0, \d0, \a0
+ vxor.vv \d1, \d1, \a1
+ vxor.vv \d2, \d2, \a2
+ vxor.vv \d3, \d3, \a3
+ vror.vi \d0, \d0, 32 - 8
+ vror.vi \d1, \d1, 32 - 8
+ vror.vi \d2, \d2, 32 - 8
+ vror.vi \d3, \d3, 32 - 8
+
+ // c += d; b ^= c; b = rol(b, 7);
+ vadd.vv \c0, \c0, \d0
+ vadd.vv \c1, \c1, \d1
+ vadd.vv \c2, \c2, \d2
+ vadd.vv \c3, \c3, \d3
+ vxor.vv \b0, \b0, \c0
+ vxor.vv \b1, \b1, \c1
+ vxor.vv \b2, \b2, \c2
+ vxor.vv \b3, \b3, \c3
+ vror.vi \b0, \b0, 32 - 7
+ vror.vi \b1, \b1, 32 - 7
+ vror.vi \b2, \b2, 32 - 7
+ vror.vi \b3, \b3, 32 - 7
+.endm
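+
+// For reference, the scalar C quarter round that each group of four
+// vector instructions above implements (ad hoc names):
+//
+//   a += b;  d ^= a;  d = rol32(d, 16);
+//   c += d;  b ^= c;  b = rol32(b, 12);
+//   a += b;  d ^= a;  d = rol32(d, 8);
+//   c += d;  b ^= c;  b = rol32(b, 7);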
+
+// void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out,
+// size_t nblocks, int nrounds);
+//
+// |nblocks| is the number of 64-byte blocks to process, and must be nonzero.
+//
+// |state| gives the ChaCha state matrix, including the 32-bit counter in
+// state->x[12] following the RFC7539 convention; note that this differs from
+// the original ChaCha paper, which uses a 64-bit counter in state->x[12..13].
+// The updated 32-bit counter is written back to state->x[12] before returning.
+SYM_FUNC_START(chacha_zvkb)
+ addi sp, sp, -96
+ sd s0, 0(sp)
+ sd s1, 8(sp)
+ sd s2, 16(sp)
+ sd s3, 24(sp)
+ sd s4, 32(sp)
+ sd s5, 40(sp)
+ sd s6, 48(sp)
+ sd s7, 56(sp)
+ sd s8, 64(sp)
+ sd s9, 72(sp)
+ sd s10, 80(sp)
+ sd s11, 88(sp)
+
+ li STRIDE, 64
+
+ // Set up the initial state matrix in scalar registers.
+ lw CONSTS0, 0(STATEP)
+ lw CONSTS1, 4(STATEP)
+ lw CONSTS2, 8(STATEP)
+ lw CONSTS3, 12(STATEP)
+ lw KEY0, 16(STATEP)
+ lw KEY1, 20(STATEP)
+ lw KEY2, 24(STATEP)
+ lw KEY3, 28(STATEP)
+ lw KEY4, 32(STATEP)
+ lw KEY5, 36(STATEP)
+ lw KEY6, 40(STATEP)
+ lw KEY7, 44(STATEP)
+ lw COUNTER, 48(STATEP)
+ lw NONCE0, 52(STATEP)
+ lw NONCE1, 56(STATEP)
+ lw NONCE2, 60(STATEP)
+
+.Lblock_loop:
+ // Set vl to the number of blocks to process in this iteration.
+ vsetvli VL, NBLOCKS, e32, m1, ta, ma
+
+ // Set up the initial state matrix for the next VL blocks in v0-v15.
+ // v{i} holds the i'th 32-bit word of the state matrix for all blocks.
+ // Note that only the counter word, at index 12, differs across blocks.
+ vmv.v.x v0, CONSTS0
+ vmv.v.x v1, CONSTS1
+ vmv.v.x v2, CONSTS2
+ vmv.v.x v3, CONSTS3
+ vmv.v.x v4, KEY0
+ vmv.v.x v5, KEY1
+ vmv.v.x v6, KEY2
+ vmv.v.x v7, KEY3
+ vmv.v.x v8, KEY4
+ vmv.v.x v9, KEY5
+ vmv.v.x v10, KEY6
+ vmv.v.x v11, KEY7
+ vid.v v12
+ vadd.vx v12, v12, COUNTER
+ vmv.v.x v13, NONCE0
+ vmv.v.x v14, NONCE1
+ vmv.v.x v15, NONCE2
+
+ // Load the first half of the input data for each block into v16-v23.
+ // v{16+i} holds the i'th 32-bit word for all blocks.
+ vlsseg8e32.v v16, (INP), STRIDE
+
+ mv ROUND_CTR, NROUNDS
+.Lnext_doubleround:
+ addi ROUND_CTR, ROUND_CTR, -2
+ // column round
+ chacha_round v0, v4, v8, v12, v1, v5, v9, v13, \
+ v2, v6, v10, v14, v3, v7, v11, v15
+ // diagonal round
+ chacha_round v0, v5, v10, v15, v1, v6, v11, v12, \
+ v2, v7, v8, v13, v3, v4, v9, v14
+ bnez ROUND_CTR, .Lnext_doubleround
+
+ // Load the second half of the input data for each block into v24-v31.
+ // v{24+i} holds the {8+i}'th 32-bit word for all blocks.
+ addi TMP, INP, 32
+ vlsseg8e32.v v24, (TMP), STRIDE
+
+ // Finalize the first half of the keystream for each block.
+ vadd.vx v0, v0, CONSTS0
+ vadd.vx v1, v1, CONSTS1
+ vadd.vx v2, v2, CONSTS2
+ vadd.vx v3, v3, CONSTS3
+ vadd.vx v4, v4, KEY0
+ vadd.vx v5, v5, KEY1
+ vadd.vx v6, v6, KEY2
+ vadd.vx v7, v7, KEY3
+
+ // Encrypt/decrypt the first half of the data for each block.
+ vxor.vv v16, v16, v0
+ vxor.vv v17, v17, v1
+ vxor.vv v18, v18, v2
+ vxor.vv v19, v19, v3
+ vxor.vv v20, v20, v4
+ vxor.vv v21, v21, v5
+ vxor.vv v22, v22, v6
+ vxor.vv v23, v23, v7
+
+ // Store the first half of the output data for each block.
+ vssseg8e32.v v16, (OUTP), STRIDE
+
+ // Finalize the second half of the keystream for each block.
+ vadd.vx v8, v8, KEY4
+ vadd.vx v9, v9, KEY5
+ vadd.vx v10, v10, KEY6
+ vadd.vx v11, v11, KEY7
+ vid.v v0
+ vadd.vx v12, v12, COUNTER
+ vadd.vx v13, v13, NONCE0
+ vadd.vx v14, v14, NONCE1
+ vadd.vx v15, v15, NONCE2
+ vadd.vv v12, v12, v0
+
+ // Encrypt/decrypt the second half of the data for each block.
+ vxor.vv v24, v24, v8
+ vxor.vv v25, v25, v9
+ vxor.vv v26, v26, v10
+ vxor.vv v27, v27, v11
+ vxor.vv v28, v28, v12
+ vxor.vv v29, v29, v13
+ vxor.vv v30, v30, v14
+ vxor.vv v31, v31, v15
+
+ // Store the second half of the output data for each block.
+ addi TMP, OUTP, 32
+ vssseg8e32.v v24, (TMP), STRIDE
+
+ // Update the counter, the remaining number of blocks, and the input and
+ // output pointers according to the number of blocks processed (VL).
+ add COUNTER, COUNTER, VL
+ sub NBLOCKS, NBLOCKS, VL
+ slli TMP, VL, 6
+ add OUTP, OUTP, TMP
+ add INP, INP, TMP
+ bnez NBLOCKS, .Lblock_loop
+
+ sw COUNTER, 48(STATEP)
+ ld s0, 0(sp)
+ ld s1, 8(sp)
+ ld s2, 16(sp)
+ ld s3, 24(sp)
+ ld s4, 32(sp)
+ ld s5, 40(sp)
+ ld s6, 48(sp)
+ ld s7, 56(sp)
+ ld s8, 64(sp)
+ ld s9, 72(sp)
+ ld s10, 80(sp)
+ ld s11, 88(sp)
+ addi sp, sp, 96
+ ret
+SYM_FUNC_END(chacha_zvkb)
diff --git a/lib/crypto/riscv/chacha.h b/lib/crypto/riscv/chacha.h
new file mode 100644
index 000000000000..5c000c6aef4b
--- /dev/null
+++ b/lib/crypto/riscv/chacha.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * ChaCha stream cipher (RISC-V optimized)
+ *
+ * Copyright (C) 2023 SiFive, Inc.
+ * Author: Jerry Shih <jerry.shih@sifive.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+#include <crypto/internal/simd.h>
+#include <linux/linkage.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_zvkb);
+
+asmlinkage void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out,
+ size_t nblocks, int nrounds);
+
+#define hchacha_block_arch hchacha_block_generic /* not implemented yet */
+
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int bytes, int nrounds)
+{
+ u8 block_buffer[CHACHA_BLOCK_SIZE];
+ unsigned int full_blocks = bytes / CHACHA_BLOCK_SIZE;
+ unsigned int tail_bytes = bytes % CHACHA_BLOCK_SIZE;
+
+ if (!static_branch_likely(&use_zvkb) || !crypto_simd_usable())
+ return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+ kernel_vector_begin();
+ if (full_blocks) {
+ chacha_zvkb(state, src, dst, full_blocks, nrounds);
+ src += full_blocks * CHACHA_BLOCK_SIZE;
+ dst += full_blocks * CHACHA_BLOCK_SIZE;
+ }
+ if (tail_bytes) {
+ memcpy(block_buffer, src, tail_bytes);
+ chacha_zvkb(state, block_buffer, block_buffer, 1, nrounds);
+ memcpy(dst, block_buffer, tail_bytes);
+ }
+ kernel_vector_end();
+}
+
+#define chacha_mod_init_arch chacha_mod_init_arch
+static void chacha_mod_init_arch(void)
+{
+ if (riscv_isa_extension_available(NULL, ZVKB) &&
+ riscv_vector_vlen() >= 128)
+ static_branch_enable(&use_zvkb);
+}
diff --git a/lib/crypto/riscv/poly1305-riscv.pl b/lib/crypto/riscv/poly1305-riscv.pl
new file mode 100644
index 000000000000..e25e6338a9ac
--- /dev/null
+++ b/lib/crypto/riscv/poly1305-riscv.pl
@@ -0,0 +1,847 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for use with OpenSSL.
+# ====================================================================
+#
+# Poly1305 hash for RISC-V.
+#
+# February 2019
+#
+# In essence it's a pretty straightforward transliteration of the MIPS
+# module [without the big-endian option].
+#
+# 1.8 cycles per byte on U74, >100% faster than compiler-generated
+# code; 1.9 cpb on C910, ~75% improvement; 3.3 cpb on Spacemit X60,
+# ~69% improvement.
+#
+# June 2024.
+#
+# Add CHERI support.
+#
+######################################################################
+#
+($zero,$ra,$sp,$gp,$tp)=map("x$_",(0..4));
+($t0,$t1,$t2,$t3,$t4,$t5,$t6)=map("x$_",(5..7,28..31));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(10..17));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("x$_",(8,9,18..27));
+#
+######################################################################
+
+$flavour = shift || "64";
+
+for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); }
+open STDOUT,">$output";
+
+$code.=<<___;
+#ifdef __KERNEL__
+# ifdef __riscv_zicfilp
+# undef __riscv_zicfilp // calls are expected to be direct
+# endif
+#endif
+
+#if defined(__CHERI_PURE_CAPABILITY__) && !defined(__riscv_misaligned_fast)
+# define __riscv_misaligned_fast 1
+#endif
+___
+
+if ($flavour =~ /64/) {{{
+######################################################################
+# 64-bit code path...
+#
+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
+my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$t0,$t1,$t2);
+
+$code.=<<___;
+#if __riscv_xlen == 64
+# if __SIZEOF_POINTER__ == 16
+# define PUSH csc
+# define POP clc
+# else
+# define PUSH sd
+# define POP ld
+# endif
+#else
+# error "unsupported __riscv_xlen"
+#endif
+
+.option pic
+.text
+
+.globl poly1305_init
+.type poly1305_init,\@function
+poly1305_init:
+#ifdef __riscv_zicfilp
+ lpad 0
+#endif
+ sd $zero,0($ctx)
+ sd $zero,8($ctx)
+ sd $zero,16($ctx)
+
+ beqz $inp,.Lno_key
+
+#ifndef __riscv_misaligned_fast
+ andi $tmp0,$inp,7 # $inp % 8
+ andi $inp,$inp,-8 # align $inp
+ slli $tmp0,$tmp0,3 # byte to bit offset
+#endif
+ ld $in0,0($inp)
+ ld $in1,8($inp)
+#ifndef __riscv_misaligned_fast
+ beqz $tmp0,.Laligned_key
+
+ ld $tmp2,16($inp)
+ neg $tmp1,$tmp0 # implicit &63 in sll
+ srl $in0,$in0,$tmp0
+ sll $tmp3,$in1,$tmp1
+ srl $in1,$in1,$tmp0
+ sll $tmp2,$tmp2,$tmp1
+ or $in0,$in0,$tmp3
+ or $in1,$in1,$tmp2
+
+.Laligned_key:
+#endif
+ li $tmp0,1
+ slli $tmp0,$tmp0,32 # 0x0000000100000000
+ addi $tmp0,$tmp0,-63 # 0x00000000ffffffc1
+ slli $tmp0,$tmp0,28 # 0x0ffffffc10000000
+ addi $tmp0,$tmp0,-1 # 0x0ffffffc0fffffff
+
+ and $in0,$in0,$tmp0
+ addi $tmp0,$tmp0,-3 # 0x0ffffffc0ffffffc
+ and $in1,$in1,$tmp0
+
+ sd $in0,24($ctx)
+ srli $tmp0,$in1,2
+ sd $in1,32($ctx)
+ add $tmp0,$tmp0,$in1 # s1 = r1 + (r1 >> 2)
+ sd $tmp0,40($ctx)
+
+.Lno_key:
+ li $a0,0 # return 0
+ ret
+.size poly1305_init,.-poly1305_init
+___
+{
+my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
+ ($s0,$s1,$s2,$s3,$t3,$t4,$in0,$in1,$t2);
+my ($shr,$shl) = ($t5,$t6);		# used for misaligned input
+
+$code.=<<___;
+.globl poly1305_blocks
+.type poly1305_blocks,\@function
+poly1305_blocks:
+#ifdef __riscv_zicfilp
+ lpad 0
+#endif
+ andi $len,$len,-16 # complete blocks only
+ beqz $len,.Lno_data
+
+ caddi $sp,$sp,-4*__SIZEOF_POINTER__
+ PUSH $s0,3*__SIZEOF_POINTER__($sp)
+ PUSH $s1,2*__SIZEOF_POINTER__($sp)
+ PUSH $s2,1*__SIZEOF_POINTER__($sp)
+ PUSH $s3,0*__SIZEOF_POINTER__($sp)
+
+#ifndef __riscv_misaligned_fast
+ andi $shr,$inp,7
+ andi $inp,$inp,-8 # align $inp
+ slli $shr,$shr,3 # byte to bit offset
+ neg $shl,$shr # implicit &63 in sll
+#endif
+
+ ld $h0,0($ctx) # load hash value
+ ld $h1,8($ctx)
+ ld $h2,16($ctx)
+
+ ld $r0,24($ctx) # load key
+ ld $r1,32($ctx)
+ ld $rs1,40($ctx)
+
+ add $len,$len,$inp # end of buffer
+
+.Loop:
+ ld $in0,0($inp) # load input
+ ld $in1,8($inp)
+#ifndef __riscv_misaligned_fast
+ beqz $shr,.Laligned_inp
+
+ ld $tmp2,16($inp)
+ srl $in0,$in0,$shr
+ sll $tmp3,$in1,$shl
+ srl $in1,$in1,$shr
+ sll $tmp2,$tmp2,$shl
+ or $in0,$in0,$tmp3
+ or $in1,$in1,$tmp2
+
+.Laligned_inp:
+#endif
+ caddi $inp,$inp,16
+
+ andi $tmp0,$h2,-4 # modulo-scheduled reduction
+ srli $tmp1,$h2,2
+ andi $h2,$h2,3
+
+ add $d0,$h0,$in0 # accumulate input
+ add $tmp1,$tmp1,$tmp0
+ sltu $tmp0,$d0,$h0
+ add $d0,$d0,$tmp1 # ... and residue
+ sltu $tmp1,$d0,$tmp1
+ add $d1,$h1,$in1
+ add $tmp0,$tmp0,$tmp1
+ sltu $tmp1,$d1,$h1
+ add $d1,$d1,$tmp0
+
+ add $d2,$h2,$padbit
+ sltu $tmp0,$d1,$tmp0
+ mulhu $h1,$r0,$d0 # h0*r0
+ mul $h0,$r0,$d0
+
+ add $d2,$d2,$tmp1
+ add $d2,$d2,$tmp0
+ mulhu $tmp1,$rs1,$d1 # h1*5*r1
+ mul $tmp0,$rs1,$d1
+
+ mulhu $h2,$r1,$d0 # h0*r1
+ mul $tmp2,$r1,$d0
+ add $h0,$h0,$tmp0
+ add $h1,$h1,$tmp1
+ sltu $tmp0,$h0,$tmp0
+
+ add $h1,$h1,$tmp0
+ add $h1,$h1,$tmp2
+ mulhu $tmp1,$r0,$d1 # h1*r0
+ mul $tmp0,$r0,$d1
+
+ sltu $tmp2,$h1,$tmp2
+ add $h2,$h2,$tmp2
+ mul $tmp2,$rs1,$d2 # h2*5*r1
+
+ add $h1,$h1,$tmp0
+ add $h2,$h2,$tmp1
+ mul $tmp3,$r0,$d2 # h2*r0
+ sltu $tmp0,$h1,$tmp0
+ add $h2,$h2,$tmp0
+
+ add $h1,$h1,$tmp2
+ sltu $tmp2,$h1,$tmp2
+ add $h2,$h2,$tmp2
+ add $h2,$h2,$tmp3
+
+ bne $inp,$len,.Loop
+
+ sd $h0,0($ctx) # store hash value
+ sd $h1,8($ctx)
+ sd $h2,16($ctx)
+
+ POP $s0,3*__SIZEOF_POINTER__($sp) # epilogue
+ POP $s1,2*__SIZEOF_POINTER__($sp)
+ POP $s2,1*__SIZEOF_POINTER__($sp)
+ POP $s3,0*__SIZEOF_POINTER__($sp)
+ caddi $sp,$sp,4*__SIZEOF_POINTER__
+
+.Lno_data:
+ ret
+.size poly1305_blocks,.-poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
+
+$code.=<<___;
+.globl poly1305_emit
+.type poly1305_emit,\@function
+poly1305_emit:
+#ifdef __riscv_zicfilp
+ lpad 0
+#endif
+ ld $tmp2,16($ctx)
+ ld $tmp0,0($ctx)
+ ld $tmp1,8($ctx)
+
+ andi $in0,$tmp2,-4 # final reduction
+ srl $in1,$tmp2,2
+ andi $tmp2,$tmp2,3
+ add $in0,$in0,$in1
+
+ add $tmp0,$tmp0,$in0
+ sltu $in1,$tmp0,$in0
+ addi $in0,$tmp0,5 # compare to modulus
+ add $tmp1,$tmp1,$in1
+ sltiu $tmp3,$in0,5
+ sltu $tmp4,$tmp1,$in1
+ add $in1,$tmp1,$tmp3
+ add $tmp2,$tmp2,$tmp4
+ sltu $tmp3,$in1,$tmp3
+ add $tmp2,$tmp2,$tmp3
+
+ srli $tmp2,$tmp2,2 # see if it carried/borrowed
+ neg $tmp2,$tmp2
+
+ xor $in0,$in0,$tmp0
+ xor $in1,$in1,$tmp1
+ and $in0,$in0,$tmp2
+ and $in1,$in1,$tmp2
+ xor $in0,$in0,$tmp0
+ xor $in1,$in1,$tmp1
+
+ lwu $tmp0,0($nonce) # load nonce
+ lwu $tmp1,4($nonce)
+ lwu $tmp2,8($nonce)
+ lwu $tmp3,12($nonce)
+ slli $tmp1,$tmp1,32
+ slli $tmp3,$tmp3,32
+ or $tmp0,$tmp0,$tmp1
+ or $tmp2,$tmp2,$tmp3
+
+ add $in0,$in0,$tmp0 # accumulate nonce
+ add $in1,$in1,$tmp2
+ sltu $tmp0,$in0,$tmp0
+ add $in1,$in1,$tmp0
+
+#ifdef __riscv_misaligned_fast
+ sd $in0,0($mac) # write mac value
+ sd $in1,8($mac)
+#else
+ srli $tmp0,$in0,8 # write mac value
+ srli $tmp1,$in0,16
+ srli $tmp2,$in0,24
+ sb $in0,0($mac)
+ srli $tmp3,$in0,32
+ sb $tmp0,1($mac)
+ srli $tmp0,$in0,40
+ sb $tmp1,2($mac)
+ srli $tmp1,$in0,48
+ sb $tmp2,3($mac)
+ srli $tmp2,$in0,56
+ sb $tmp3,4($mac)
+ srli $tmp3,$in1,8
+ sb $tmp0,5($mac)
+ srli $tmp0,$in1,16
+ sb $tmp1,6($mac)
+ srli $tmp1,$in1,24
+ sb $tmp2,7($mac)
+
+ sb $in1,8($mac)
+ srli $tmp2,$in1,32
+ sb $tmp3,9($mac)
+ srli $tmp3,$in1,40
+ sb $tmp0,10($mac)
+ srli $tmp0,$in1,48
+ sb $tmp1,11($mac)
+ srli $tmp1,$in1,56
+ sb $tmp2,12($mac)
+ sb $tmp3,13($mac)
+ sb $tmp0,14($mac)
+ sb $tmp1,15($mac)
+#endif
+
+ ret
+.size poly1305_emit,.-poly1305_emit
+.string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
+___
+}
+}}} else {{{
+######################################################################
+# 32-bit code path
+#
+
+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
+my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
+ ($a4,$a5,$a6,$a7,$t0,$t1,$t2,$t3);
+
+$code.=<<___;
+#if __riscv_xlen == 32
+# if __SIZEOF_POINTER__ == 8
+# define PUSH csc
+# define POP clc
+# else
+# define PUSH sw
+# define POP lw
+# endif
+# define MULX(hi,lo,a,b) mulhu hi,a,b; mul lo,a,b
+# define srliw srli
+# define srlw srl
+# define sllw sll
+# define addw add
+# define addiw addi
+# define mulw mul
+#elif __riscv_xlen == 64
+# if __SIZEOF_POINTER__ == 16
+# define PUSH csc
+# define POP clc
+# else
+# define PUSH sd
+# define POP ld
+# endif
+# define MULX(hi,lo,a,b) slli b,b,32; srli b,b,32; mul hi,a,b; addiw lo,hi,0; srai hi,hi,32
+#else
+# error "unsupported __riscv_xlen"
+#endif
+
+.option pic
+.text
+
+.globl poly1305_init
+.type poly1305_init,\@function
+poly1305_init:
+#ifdef __riscv_zicfilp
+ lpad 0
+#endif
+ sw $zero,0($ctx)
+ sw $zero,4($ctx)
+ sw $zero,8($ctx)
+ sw $zero,12($ctx)
+ sw $zero,16($ctx)
+
+ beqz $inp,.Lno_key
+
+#ifndef __riscv_misaligned_fast
+ andi $tmp0,$inp,3 # $inp % 4
+ sub $inp,$inp,$tmp0 # align $inp
+ sll $tmp0,$tmp0,3 # byte to bit offset
+#endif
+ lw $in0,0($inp)
+ lw $in1,4($inp)
+ lw $in2,8($inp)
+ lw $in3,12($inp)
+#ifndef __riscv_misaligned_fast
+ beqz $tmp0,.Laligned_key
+
+ lw $tmp2,16($inp)
+ sub $tmp1,$zero,$tmp0
+ srlw $in0,$in0,$tmp0
+ sllw $tmp3,$in1,$tmp1
+ srlw $in1,$in1,$tmp0
+ or $in0,$in0,$tmp3
+ sllw $tmp3,$in2,$tmp1
+ srlw $in2,$in2,$tmp0
+ or $in1,$in1,$tmp3
+ sllw $tmp3,$in3,$tmp1
+ srlw $in3,$in3,$tmp0
+ or $in2,$in2,$tmp3
+ sllw $tmp2,$tmp2,$tmp1
+ or $in3,$in3,$tmp2
+.Laligned_key:
+#endif
+
+ lui $tmp0,0x10000
+ addi $tmp0,$tmp0,-1 # 0x0fffffff
+ and $in0,$in0,$tmp0
+ addi $tmp0,$tmp0,-3 # 0x0ffffffc
+ and $in1,$in1,$tmp0
+ and $in2,$in2,$tmp0
+ and $in3,$in3,$tmp0
+
+ sw $in0,20($ctx)
+ sw $in1,24($ctx)
+ sw $in2,28($ctx)
+ sw $in3,32($ctx)
+
+ srlw $tmp1,$in1,2
+ srlw $tmp2,$in2,2
+ srlw $tmp3,$in3,2
+ addw $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
+ addw $in2,$in2,$tmp2
+ addw $in3,$in3,$tmp3
+ sw $in1,36($ctx)
+ sw $in2,40($ctx)
+ sw $in3,44($ctx)
+.Lno_key:
+ li $a0,0
+ ret
+.size poly1305_init,.-poly1305_init
+___
+{
+my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
+ ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $t0,$t1,$t2);
+my ($d0,$d1,$d2,$d3) =
+ ($a4,$a5,$a6,$a7);
+my $shr = $ra;				# used for misaligned input
+
+$code.=<<___;
+.globl poly1305_blocks
+.type poly1305_blocks,\@function
+poly1305_blocks:
+#ifdef __riscv_zicfilp
+ lpad 0
+#endif
+ andi $len,$len,-16 # complete blocks only
+ beqz $len,.Labort
+
+#ifdef __riscv_zcmp
+ cm.push {ra,s0-s8}, -48
+#else
+ caddi $sp,$sp,-__SIZEOF_POINTER__*12
+ PUSH $ra, __SIZEOF_POINTER__*11($sp)
+ PUSH $s0, __SIZEOF_POINTER__*10($sp)
+ PUSH $s1, __SIZEOF_POINTER__*9($sp)
+ PUSH $s2, __SIZEOF_POINTER__*8($sp)
+ PUSH $s3, __SIZEOF_POINTER__*7($sp)
+ PUSH $s4, __SIZEOF_POINTER__*6($sp)
+ PUSH $s5, __SIZEOF_POINTER__*5($sp)
+ PUSH $s6, __SIZEOF_POINTER__*4($sp)
+ PUSH $s7, __SIZEOF_POINTER__*3($sp)
+ PUSH $s8, __SIZEOF_POINTER__*2($sp)
+#endif
+
+#ifndef __riscv_misaligned_fast
+ andi $shr,$inp,3
+ andi $inp,$inp,-4 # align $inp
+ slli $shr,$shr,3 # byte to bit offset
+#endif
+
+ lw $h0,0($ctx) # load hash value
+ lw $h1,4($ctx)
+ lw $h2,8($ctx)
+ lw $h3,12($ctx)
+ lw $h4,16($ctx)
+
+ lw $r0,20($ctx) # load key
+ lw $r1,24($ctx)
+ lw $r2,28($ctx)
+ lw $r3,32($ctx)
+ lw $rs1,36($ctx)
+ lw $rs2,40($ctx)
+ lw $rs3,44($ctx)
+
+ add $len,$len,$inp # end of buffer
+
+.Loop:
+ lw $d0,0($inp) # load input
+ lw $d1,4($inp)
+ lw $d2,8($inp)
+ lw $d3,12($inp)
+#ifndef __riscv_misaligned_fast
+ beqz $shr,.Laligned_inp
+
+ lw $t4,16($inp)
+ sub $t5,$zero,$shr
+ srlw $d0,$d0,$shr
+ sllw $t3,$d1,$t5
+ srlw $d1,$d1,$shr
+ or $d0,$d0,$t3
+ sllw $t3,$d2,$t5
+ srlw $d2,$d2,$shr
+ or $d1,$d1,$t3
+ sllw $t3,$d3,$t5
+ srlw $d3,$d3,$shr
+ or $d2,$d2,$t3
+ sllw $t4,$t4,$t5
+ or $d3,$d3,$t4
+
+.Laligned_inp:
+#endif
+ srliw $t3,$h4,2 # modulo-scheduled reduction
+ andi $t4,$h4,-4
+ andi $h4,$h4,3
+
+ addw $d0,$d0,$h0 # accumulate input
+ addw $t4,$t4,$t3
+ sltu $h0,$d0,$h0
+ addw $d0,$d0,$t4 # ... and residue
+ sltu $t4,$d0,$t4
+
+ addw $d1,$d1,$h1
+ addw $h0,$h0,$t4 # carry
+ sltu $h1,$d1,$h1
+ addw $d1,$d1,$h0
+ sltu $h0,$d1,$h0
+
+ addw $d2,$d2,$h2
+ addw $h1,$h1,$h0 # carry
+ sltu $h2,$d2,$h2
+ addw $d2,$d2,$h1
+ sltu $h1,$d2,$h1
+
+ addw $d3,$d3,$h3
+ addw $h2,$h2,$h1 # carry
+ sltu $h3,$d3,$h3
+ addw $d3,$d3,$h2
+
+ MULX ($h1,$h0,$r0,$d0) # d0*r0
+
+ sltu $h2,$d3,$h2
+ addw $h3,$h3,$h2 # carry
+
+ MULX ($t4,$t3,$rs3,$d1) # d1*s3
+
+ addw $h4,$h4,$padbit
+ caddi $inp,$inp,16
+ addw $h4,$h4,$h3
+
+ MULX ($t6,$a3,$rs2,$d2) # d2*s2
+ addw $h0,$h0,$t3
+ addw $h1,$h1,$t4
+ sltu $t3,$h0,$t3
+ addw $h1,$h1,$t3
+
+ MULX ($t4,$t3,$rs1,$d3) # d3*s1
+ addw $h0,$h0,$a3
+ addw $h1,$h1,$t6
+ sltu $a3,$h0,$a3
+ addw $h1,$h1,$a3
+
+
+ MULX ($h2,$a3,$r1,$d0) # d0*r1
+ addw $h0,$h0,$t3
+ addw $h1,$h1,$t4
+ sltu $t3,$h0,$t3
+ addw $h1,$h1,$t3
+
+ MULX ($t4,$t3,$r0,$d1) # d1*r0
+ addw $h1,$h1,$a3
+ sltu $a3,$h1,$a3
+ addw $h2,$h2,$a3
+
+ MULX ($t6,$a3,$rs3,$d2) # d2*s3
+ addw $h1,$h1,$t3
+ addw $h2,$h2,$t4
+ sltu $t3,$h1,$t3
+ addw $h2,$h2,$t3
+
+ MULX ($t4,$t3,$rs2,$d3) # d3*s2
+ addw $h1,$h1,$a3
+ addw $h2,$h2,$t6
+ sltu $a3,$h1,$a3
+ addw $h2,$h2,$a3
+
+ mulw $a3,$rs1,$h4 # h4*s1
+ addw $h1,$h1,$t3
+ addw $h2,$h2,$t4
+ sltu $t3,$h1,$t3
+ addw $h2,$h2,$t3
+
+
+ MULX ($h3,$t3,$r2,$d0) # d0*r2
+ addw $h1,$h1,$a3
+ sltu $a3,$h1,$a3
+ addw $h2,$h2,$a3
+
+ MULX ($t6,$a3,$r1,$d1) # d1*r1
+ addw $h2,$h2,$t3
+ sltu $t3,$h2,$t3
+ addw $h3,$h3,$t3
+
+ MULX ($t4,$t3,$r0,$d2) # d2*r0
+ addw $h2,$h2,$a3
+ addw $h3,$h3,$t6
+ sltu $a3,$h2,$a3
+ addw $h3,$h3,$a3
+
+ MULX ($t6,$a3,$rs3,$d3) # d3*s3
+ addw $h2,$h2,$t3
+ addw $h3,$h3,$t4
+ sltu $t3,$h2,$t3
+ addw $h3,$h3,$t3
+
+ mulw $t3,$rs2,$h4 # h4*s2
+ addw $h2,$h2,$a3
+ addw $h3,$h3,$t6
+ sltu $a3,$h2,$a3
+ addw $h3,$h3,$a3
+
+
+ MULX ($t6,$a3,$r3,$d0) # d0*r3
+ addw $h2,$h2,$t3
+ sltu $t3,$h2,$t3
+ addw $h3,$h3,$t3
+
+ MULX ($t4,$t3,$r2,$d1) # d1*r2
+ addw $h3,$h3,$a3
+ sltu $a3,$h3,$a3
+ addw $t6,$t6,$a3
+
+ MULX ($a3,$d3,$r0,$d3) # d3*r0
+ addw $h3,$h3,$t3
+ addw $t6,$t6,$t4
+ sltu $t3,$h3,$t3
+ addw $t6,$t6,$t3
+
+ MULX ($t4,$t3,$r1,$d2) # d2*r1
+ addw $h3,$h3,$d3
+ addw $t6,$t6,$a3
+ sltu $d3,$h3,$d3
+ addw $t6,$t6,$d3
+
+ mulw $a3,$rs3,$h4 # h4*s3
+ addw $h3,$h3,$t3
+ addw $t6,$t6,$t4
+ sltu $t3,$h3,$t3
+ addw $t6,$t6,$t3
+
+
+ mulw $h4,$r0,$h4 # h4*r0
+ addw $h3,$h3,$a3
+ sltu $a3,$h3,$a3
+ addw $t6,$t6,$a3
+ addw $h4,$t6,$h4
+
+ li $padbit,1 # if we loop, padbit is 1
+
+ bne $inp,$len,.Loop
+
+ sw $h0,0($ctx) # store hash value
+ sw $h1,4($ctx)
+ sw $h2,8($ctx)
+ sw $h3,12($ctx)
+ sw $h4,16($ctx)
+
+#ifdef __riscv_zcmp
+ cm.popret {ra,s0-s8}, 48
+#else
+ POP $ra, __SIZEOF_POINTER__*11($sp)
+ POP $s0, __SIZEOF_POINTER__*10($sp)
+ POP $s1, __SIZEOF_POINTER__*9($sp)
+ POP $s2, __SIZEOF_POINTER__*8($sp)
+ POP $s3, __SIZEOF_POINTER__*7($sp)
+ POP $s4, __SIZEOF_POINTER__*6($sp)
+ POP $s5, __SIZEOF_POINTER__*5($sp)
+ POP $s6, __SIZEOF_POINTER__*4($sp)
+ POP $s7, __SIZEOF_POINTER__*3($sp)
+ POP $s8, __SIZEOF_POINTER__*2($sp)
+ caddi $sp,$sp,__SIZEOF_POINTER__*12
+#endif
+.Labort:
+ ret
+.size poly1305_blocks,.-poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
+
+$code.=<<___;
+.globl poly1305_emit
+.type poly1305_emit,\@function
+poly1305_emit:
+#ifdef __riscv_zicfilp
+ lpad 0
+#endif
+ lw $tmp4,16($ctx)
+ lw $tmp0,0($ctx)
+ lw $tmp1,4($ctx)
+ lw $tmp2,8($ctx)
+ lw $tmp3,12($ctx)
+
+ srliw $ctx,$tmp4,2 # final reduction
+ andi $in0,$tmp4,-4
+ andi $tmp4,$tmp4,3
+ addw $ctx,$ctx,$in0
+
+ addw $tmp0,$tmp0,$ctx
+ sltu $ctx,$tmp0,$ctx
+ addiw $in0,$tmp0,5 # compare to modulus
+ addw $tmp1,$tmp1,$ctx
+ sltiu $in1,$in0,5
+ sltu $ctx,$tmp1,$ctx
+ addw $in1,$in1,$tmp1
+ addw $tmp2,$tmp2,$ctx
+ sltu $in2,$in1,$tmp1
+ sltu $ctx,$tmp2,$ctx
+ addw $in2,$in2,$tmp2
+ addw $tmp3,$tmp3,$ctx
+ sltu $in3,$in2,$tmp2
+ sltu $ctx,$tmp3,$ctx
+ addw $in3,$in3,$tmp3
+ addw $tmp4,$tmp4,$ctx
+ sltu $ctx,$in3,$tmp3
+ addw $ctx,$ctx,$tmp4
+
+ srl $ctx,$ctx,2 # see if it carried/borrowed
+ sub $ctx,$zero,$ctx
+
+ xor $in0,$in0,$tmp0
+ xor $in1,$in1,$tmp1
+ xor $in2,$in2,$tmp2
+ xor $in3,$in3,$tmp3
+ and $in0,$in0,$ctx
+ and $in1,$in1,$ctx
+ and $in2,$in2,$ctx
+ and $in3,$in3,$ctx
+ xor $in0,$in0,$tmp0
+ xor $in1,$in1,$tmp1
+ xor $in2,$in2,$tmp2
+ xor $in3,$in3,$tmp3
+
+ lw $tmp0,0($nonce) # load nonce
+ lw $tmp1,4($nonce)
+ lw $tmp2,8($nonce)
+ lw $tmp3,12($nonce)
+
+ addw $in0,$in0,$tmp0 # accumulate nonce
+ sltu $ctx,$in0,$tmp0
+
+ addw $in1,$in1,$tmp1
+ sltu $tmp1,$in1,$tmp1
+ addw $in1,$in1,$ctx
+ sltu $ctx,$in1,$ctx
+ addw $ctx,$ctx,$tmp1
+
+ addw $in2,$in2,$tmp2
+ sltu $tmp2,$in2,$tmp2
+ addw $in2,$in2,$ctx
+ sltu $ctx,$in2,$ctx
+ addw $ctx,$ctx,$tmp2
+
+ addw $in3,$in3,$tmp3
+ addw $in3,$in3,$ctx
+
+#ifdef __riscv_misaligned_fast
+ sw $in0,0($mac) # write mac value
+ sw $in1,4($mac)
+ sw $in2,8($mac)
+ sw $in3,12($mac)
+#else
+ srl $tmp0,$in0,8 # write mac value
+ srl $tmp1,$in0,16
+ srl $tmp2,$in0,24
+ sb $in0, 0($mac)
+ sb $tmp0,1($mac)
+ srl $tmp0,$in1,8
+ sb $tmp1,2($mac)
+ srl $tmp1,$in1,16
+ sb $tmp2,3($mac)
+ srl $tmp2,$in1,24
+ sb $in1, 4($mac)
+ sb $tmp0,5($mac)
+ srl $tmp0,$in2,8
+ sb $tmp1,6($mac)
+ srl $tmp1,$in2,16
+ sb $tmp2,7($mac)
+ srl $tmp2,$in2,24
+ sb $in2, 8($mac)
+ sb $tmp0,9($mac)
+ srl $tmp0,$in3,8
+ sb $tmp1,10($mac)
+ srl $tmp1,$in3,16
+ sb $tmp2,11($mac)
+ srl $tmp2,$in3,24
+ sb $in3, 12($mac)
+ sb $tmp0,13($mac)
+ sb $tmp1,14($mac)
+ sb $tmp2,15($mac)
+#endif
+
+ ret
+.size poly1305_emit,.-poly1305_emit
+.string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
+___
+}
+}}}
+
+foreach (split("\n", $code)) {
+ if ($flavour =~ /^cheri/) {
+ s/\(x([0-9]+)\)/(c$1)/ and s/\b([ls][bhwd]u?)\b/c$1/;
+ s/\b(PUSH|POP)(\s+)x([0-9]+)/$1$2c$3/ or
+ s/\b(ret|jal)\b/c$1/;
+ s/\bcaddi?\b/cincoffset/ and s/\bx([0-9]+,)/c$1/g or
+ m/\bcmove\b/ and s/\bx([0-9]+)/c$1/g;
+ } else {
+ s/\bcaddi?\b/add/ or
+ s/\bcmove\b/mv/;
+ }
+ print $_, "\n";
+}
+
+close STDOUT;
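For reference, the generator takes a flavour argument followed by an output
file name; a plausible invocation (the kernel's actual build rule is not
shown in this patch) would be:

	perl poly1305-riscv.pl 64 poly1305-riscv-64.S

A flavour matching /64/ selects the 64-bit code path above, while a flavour
beginning with "cheri" (e.g. "cheri64") additionally enables the
capability-register rewriting done by the post-processing loop at the end
of the script.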
diff --git a/lib/crypto/riscv/poly1305.h b/lib/crypto/riscv/poly1305.h
new file mode 100644
index 000000000000..88f3df44e355
--- /dev/null
+++ b/lib/crypto/riscv/poly1305.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for RISC-V
+ *
+ * Copyright (C) 2025 Institute of Software, CAS.
+ */
+
+asmlinkage void poly1305_block_init(struct poly1305_block_state *state,
+ const u8 raw_key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks(struct poly1305_block_state *state,
+ const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit(const struct poly1305_state *state,
+ u8 digest[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4]);
diff --git a/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S b/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S
new file mode 100644
index 000000000000..1618d1220a6e
--- /dev/null
+++ b/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S
@@ -0,0 +1,225 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// This file is dual-licensed, meaning that you can use it under your
+// choice of either of the following two licenses:
+//
+// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You can obtain
+// a copy in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// or
+//
+// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
+// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
+// Copyright 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The generated code of this file depends on the following RISC-V extensions:
+// - RV64I
+// - RISC-V Vector ('V') with VLEN >= 128
+// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknha' or 'Zvknhb')
+// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
+
+#include <linux/linkage.h>
+
+.text
+.option arch, +zvknha, +zvkb
+
+#define STATEP a0
+#define DATA a1
+#define NUM_BLOCKS a2
+
+#define STATEP_C a3
+
+#define MASK v0
+#define INDICES v1
+#define W0 v2
+#define W1 v3
+#define W2 v4
+#define W3 v5
+#define VTMP v6
+#define FEBA v7
+#define HGDC v8
+#define K0 v10
+#define K1 v11
+#define K2 v12
+#define K3 v13
+#define K4 v14
+#define K5 v15
+#define K6 v16
+#define K7 v17
+#define K8 v18
+#define K9 v19
+#define K10 v20
+#define K11 v21
+#define K12 v22
+#define K13 v23
+#define K14 v24
+#define K15 v25
+#define PREV_FEBA v26
+#define PREV_HGDC v27
+
+// Do 4 rounds of SHA-256. w0 contains the current 4 message schedule words.
+//
+// If not all the message schedule words have been computed yet, then this also
+// computes 4 more message schedule words. w1-w3 contain the next 3 groups of 4
+// message schedule words; this macro computes the group after w3 and writes it
+// to w0. This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3,
+// w0), so the caller must cycle through the registers accordingly.
+.macro sha256_4rounds last, k, w0, w1, w2, w3
+ vadd.vv VTMP, \k, \w0
+ vsha2cl.vv HGDC, FEBA, VTMP
+ vsha2ch.vv FEBA, HGDC, VTMP
+.if !\last
+ vmerge.vvm VTMP, \w2, \w1, MASK
+ vsha2ms.vv \w0, VTMP, \w3
+.endif
+.endm
+
+.macro sha256_16rounds last, k0, k1, k2, k3
+ sha256_4rounds \last, \k0, W0, W1, W2, W3
+ sha256_4rounds \last, \k1, W1, W2, W3, W0
+ sha256_4rounds \last, \k2, W2, W3, W0, W1
+ sha256_4rounds \last, \k3, W3, W0, W1, W2
+.endm
+
+// void sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state,
+// const u8 *data, size_t nblocks);
+SYM_FUNC_START(sha256_transform_zvknha_or_zvknhb_zvkb)
+
+ // Load the round constants into K0-K15.
+ vsetivli zero, 4, e32, m1, ta, ma
+ la t0, K256
+ vle32.v K0, (t0)
+ addi t0, t0, 16
+ vle32.v K1, (t0)
+ addi t0, t0, 16
+ vle32.v K2, (t0)
+ addi t0, t0, 16
+ vle32.v K3, (t0)
+ addi t0, t0, 16
+ vle32.v K4, (t0)
+ addi t0, t0, 16
+ vle32.v K5, (t0)
+ addi t0, t0, 16
+ vle32.v K6, (t0)
+ addi t0, t0, 16
+ vle32.v K7, (t0)
+ addi t0, t0, 16
+ vle32.v K8, (t0)
+ addi t0, t0, 16
+ vle32.v K9, (t0)
+ addi t0, t0, 16
+ vle32.v K10, (t0)
+ addi t0, t0, 16
+ vle32.v K11, (t0)
+ addi t0, t0, 16
+ vle32.v K12, (t0)
+ addi t0, t0, 16
+ vle32.v K13, (t0)
+ addi t0, t0, 16
+ vle32.v K14, (t0)
+ addi t0, t0, 16
+ vle32.v K15, (t0)
+
+	// Set up the mask for the vmerge to replace the first word (idx==0)
+	// in message scheduling. There are 4 words, so an 8-bit mask suffices.
+ vsetivli zero, 1, e8, m1, ta, ma
+ vmv.v.i MASK, 0x01
+
+ // Load the state. The state is stored as {a,b,c,d,e,f,g,h}, but we
+ // need {f,e,b,a},{h,g,d,c}. The dst vtype is e32m1 and the index vtype
+ // is e8mf4. We use index-load with the i8 indices {20, 16, 4, 0},
+ // loaded using the 32-bit little endian value 0x00041014.
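+	// For example: the i8 indices {20, 16, 4, 0} become the bytes
+	// {0x14, 0x10, 0x04, 0x00}, which read as a little-endian u32 give
+	// 0x00041014; offset 20 selects 'f' (the sixth u32 of the state).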
+ li t0, 0x00041014
+ vsetivli zero, 1, e32, m1, ta, ma
+ vmv.v.x INDICES, t0
+ addi STATEP_C, STATEP, 8
+ vsetivli zero, 4, e32, m1, ta, ma
+ vluxei8.v FEBA, (STATEP), INDICES
+ vluxei8.v HGDC, (STATEP_C), INDICES
+
+.Lnext_block:
+ addi NUM_BLOCKS, NUM_BLOCKS, -1
+
+ // Save the previous state, as it's needed later.
+ vmv.v.v PREV_FEBA, FEBA
+ vmv.v.v PREV_HGDC, HGDC
+
+ // Load the next 512-bit message block and endian-swap each 32-bit word.
+ vle32.v W0, (DATA)
+ vrev8.v W0, W0
+ addi DATA, DATA, 16
+ vle32.v W1, (DATA)
+ vrev8.v W1, W1
+ addi DATA, DATA, 16
+ vle32.v W2, (DATA)
+ vrev8.v W2, W2
+ addi DATA, DATA, 16
+ vle32.v W3, (DATA)
+ vrev8.v W3, W3
+ addi DATA, DATA, 16
+
+ // Do the 64 rounds of SHA-256.
+ sha256_16rounds 0, K0, K1, K2, K3
+ sha256_16rounds 0, K4, K5, K6, K7
+ sha256_16rounds 0, K8, K9, K10, K11
+ sha256_16rounds 1, K12, K13, K14, K15
+
+ // Add the previous state.
+ vadd.vv FEBA, FEBA, PREV_FEBA
+ vadd.vv HGDC, HGDC, PREV_HGDC
+
+ // Repeat if more blocks remain.
+ bnez NUM_BLOCKS, .Lnext_block
+
+ // Store the new state and return.
+ vsuxei8.v FEBA, (STATEP), INDICES
+ vsuxei8.v HGDC, (STATEP_C), INDICES
+ ret
+SYM_FUNC_END(sha256_transform_zvknha_or_zvknhb_zvkb)
+
+.section ".rodata"
+.p2align 2
+.type K256, @object
+K256:
+ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+.size K256, . - K256
diff --git a/lib/crypto/riscv/sha256.h b/lib/crypto/riscv/sha256.h
new file mode 100644
index 000000000000..1def18b0a4fb
--- /dev/null
+++ b/lib/crypto/riscv/sha256.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 (RISC-V accelerated)
+ *
+ * Copyright (C) 2022 VRULL GmbH
+ * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
+ *
+ * Copyright (C) 2023 SiFive, Inc.
+ * Author: Jerry Shih <jerry.shih@sifive.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions);
+
+asmlinkage void
+sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha256_blocks(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_extensions) && likely(may_use_simd())) {
+ kernel_vector_begin();
+ sha256_transform_zvknha_or_zvknhb_zvkb(state, data, nblocks);
+ kernel_vector_end();
+ } else {
+ sha256_blocks_generic(state, data, nblocks);
+ }
+}
+
+#define sha256_mod_init_arch sha256_mod_init_arch
+static void sha256_mod_init_arch(void)
+{
+ /* Both zvknha and zvknhb provide the SHA-256 instructions. */
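+	/*
+	 * VLEN >= 128 is needed because the assembly processes four 32-bit
+	 * words per register group at LMUL=1 (vsetivli ..., 4, e32, m1).
+	 */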
+ if ((riscv_isa_extension_available(NULL, ZVKNHA) ||
+ riscv_isa_extension_available(NULL, ZVKNHB)) &&
+ riscv_isa_extension_available(NULL, ZVKB) &&
+ riscv_vector_vlen() >= 128)
+ static_branch_enable(&have_extensions);
+}
diff --git a/lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S b/lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S
new file mode 100644
index 000000000000..b41eebf60546
--- /dev/null
+++ b/lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S
@@ -0,0 +1,203 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// This file is dual-licensed, meaning that you can use it under your
+// choice of either of the following two licenses:
+//
+// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You can obtain
+// a copy in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// or
+//
+// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
+// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
+// Copyright 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The generated code of this file depends on the following RISC-V extensions:
+// - RV64I
+// - RISC-V Vector ('V') with VLEN >= 128
+// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknhb')
+// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
+
+#include <linux/linkage.h>
+
+.text
+.option arch, +zvknhb, +zvkb
+
+#define STATEP a0
+#define DATA a1
+#define NUM_BLOCKS a2
+
+#define STATEP_C a3
+#define K a4
+
+#define MASK v0
+#define INDICES v1
+#define W0 v10 // LMUL=2
+#define W1 v12 // LMUL=2
+#define W2 v14 // LMUL=2
+#define W3 v16 // LMUL=2
+#define VTMP v20 // LMUL=2
+#define FEBA v22 // LMUL=2
+#define HGDC v24 // LMUL=2
+#define PREV_FEBA v26 // LMUL=2
+#define PREV_HGDC v28 // LMUL=2
+
+// Do 4 rounds of SHA-512. w0 contains the current 4 message schedule words.
+//
+// If not all the message schedule words have been computed yet, then this also
+// computes 4 more message schedule words. w1-w3 contain the next 3 groups of 4
+// message schedule words; this macro computes the group after w3 and writes it
+// to w0. This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3,
+// w0), so the caller must cycle through the registers accordingly.
+.macro sha512_4rounds last, w0, w1, w2, w3
+ vle64.v VTMP, (K)
+ addi K, K, 32
+ vadd.vv VTMP, VTMP, \w0
+ vsha2cl.vv HGDC, FEBA, VTMP
+ vsha2ch.vv FEBA, HGDC, VTMP
+.if !\last
+ vmerge.vvm VTMP, \w2, \w1, MASK
+ vsha2ms.vv \w0, VTMP, \w3
+.endif
+.endm
+
+.macro sha512_16rounds last
+ sha512_4rounds \last, W0, W1, W2, W3
+ sha512_4rounds \last, W1, W2, W3, W0
+ sha512_4rounds \last, W2, W3, W0, W1
+ sha512_4rounds \last, W3, W0, W1, W2
+.endm
+
+// void sha512_transform_zvknhb_zvkb(struct sha512_block_state *state,
+// const u8 *data, size_t nblocks);
+SYM_FUNC_START(sha512_transform_zvknhb_zvkb)
+
+	// Set up the mask for the vmerge to replace the first word (idx==0)
+	// in message scheduling. There are 4 words, so an 8-bit mask suffices.
+ vsetivli zero, 1, e8, m1, ta, ma
+ vmv.v.i MASK, 0x01
+
+ // Load the state. The state is stored as {a,b,c,d,e,f,g,h}, but we
+ // need {f,e,b,a},{h,g,d,c}. The dst vtype is e64m2 and the index vtype
+ // is e8mf4. We use index-load with the i8 indices {40, 32, 8, 0},
+ // loaded using the 32-bit little endian value 0x00082028.
+ li t0, 0x00082028
+ vsetivli zero, 1, e32, m1, ta, ma
+ vmv.v.x INDICES, t0
+ addi STATEP_C, STATEP, 16
+ vsetivli zero, 4, e64, m2, ta, ma
+ vluxei8.v FEBA, (STATEP), INDICES
+ vluxei8.v HGDC, (STATEP_C), INDICES
+
+.Lnext_block:
+ la K, K512
+ addi NUM_BLOCKS, NUM_BLOCKS, -1
+
+ // Save the previous state, as it's needed later.
+ vmv.v.v PREV_FEBA, FEBA
+ vmv.v.v PREV_HGDC, HGDC
+
+	// Load the next 1024-bit message block and endian-swap each 64-bit word.
+ vle64.v W0, (DATA)
+ vrev8.v W0, W0
+ addi DATA, DATA, 32
+ vle64.v W1, (DATA)
+ vrev8.v W1, W1
+ addi DATA, DATA, 32
+ vle64.v W2, (DATA)
+ vrev8.v W2, W2
+ addi DATA, DATA, 32
+ vle64.v W3, (DATA)
+ vrev8.v W3, W3
+ addi DATA, DATA, 32
+
+ // Do the 80 rounds of SHA-512.
+ sha512_16rounds 0
+ sha512_16rounds 0
+ sha512_16rounds 0
+ sha512_16rounds 0
+ sha512_16rounds 1
+
+ // Add the previous state.
+ vadd.vv FEBA, FEBA, PREV_FEBA
+ vadd.vv HGDC, HGDC, PREV_HGDC
+
+ // Repeat if more blocks remain.
+ bnez NUM_BLOCKS, .Lnext_block
+
+ // Store the new state and return.
+ vsuxei8.v FEBA, (STATEP), INDICES
+ vsuxei8.v HGDC, (STATEP_C), INDICES
+ ret
+SYM_FUNC_END(sha512_transform_zvknhb_zvkb)
+
+.section ".rodata"
+.p2align 3
+.type K512, @object
+K512:
+ .dword 0x428a2f98d728ae22, 0x7137449123ef65cd
+ .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
+ .dword 0x3956c25bf348b538, 0x59f111f1b605d019
+ .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
+ .dword 0xd807aa98a3030242, 0x12835b0145706fbe
+ .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
+ .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
+ .dword 0x9bdc06a725c71235, 0xc19bf174cf692694
+ .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
+ .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
+ .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
+ .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
+ .dword 0x983e5152ee66dfab, 0xa831c66d2db43210
+ .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4
+ .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725
+ .dword 0x06ca6351e003826f, 0x142929670a0e6e70
+ .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926
+ .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
+ .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8
+ .dword 0x81c2c92e47edaee6, 0x92722c851482353b
+ .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001
+ .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30
+ .dword 0xd192e819d6ef5218, 0xd69906245565a910
+ .dword 0xf40e35855771202a, 0x106aa07032bbd1b8
+ .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
+ .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
+ .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
+ .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
+ .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60
+ .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec
+ .dword 0x90befffa23631e28, 0xa4506cebde82bde9
+ .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b
+ .dword 0xca273eceea26619c, 0xd186b8c721c0c207
+ .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
+ .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6
+ .dword 0x113f9804bef90dae, 0x1b710b35131c471b
+ .dword 0x28db77f523047d84, 0x32caab7b40c72493
+ .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
+ .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
+ .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
+.size K512, . - K512
diff --git a/lib/crypto/riscv/sha512.h b/lib/crypto/riscv/sha512.h
new file mode 100644
index 000000000000..145bdab1214e
--- /dev/null
+++ b/lib/crypto/riscv/sha512.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-512 and SHA-384 using the RISC-V vector crypto extensions
+ *
+ * Copyright (C) 2023 VRULL GmbH
+ * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
+ *
+ * Copyright (C) 2023 SiFive, Inc.
+ * Author: Jerry Shih <jerry.shih@sifive.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions);
+
+asmlinkage void sha512_transform_zvknhb_zvkb(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha512_blocks(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_extensions) && likely(may_use_simd())) {
+ kernel_vector_begin();
+ sha512_transform_zvknhb_zvkb(state, data, nblocks);
+ kernel_vector_end();
+ } else {
+ sha512_blocks_generic(state, data, nblocks);
+ }
+}
+
+#define sha512_mod_init_arch sha512_mod_init_arch
+static void sha512_mod_init_arch(void)
+{
+ if (riscv_isa_extension_available(NULL, ZVKNHB) &&
+ riscv_isa_extension_available(NULL, ZVKB) &&
+ riscv_vector_vlen() >= 128)
+ static_branch_enable(&have_extensions);
+}
diff --git a/lib/crypto/s390/chacha-s390.S b/lib/crypto/s390/chacha-s390.S
new file mode 100644
index 000000000000..63f3102678c0
--- /dev/null
+++ b/lib/crypto/s390/chacha-s390.S
@@ -0,0 +1,908 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Original implementation written by Andy Polyakov, @dot-asm.
+ * This is an adaptation of the original code for kernel use.
+ *
+ * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+#include <asm/nospec-insn.h>
+#include <asm/fpu-insn.h>
+
+#define SP %r15
+#define FRAME (16 * 8 + 4 * 8)
+
+ .data
+ .balign 32
+
+SYM_DATA_START_LOCAL(sigma)
+ .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
+ .long 1,0,0,0
+ .long 2,0,0,0
+ .long 3,0,0,0
+ .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap
+
+ .long 0,1,2,3
+ .long 0x61707865,0x61707865,0x61707865,0x61707865 # smashed sigma
+ .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e
+ .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32
+ .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574
+SYM_DATA_END(sigma)
+
+ .previous
+
+ GEN_BR_THUNK %r14
+
+ .text
+
+#############################################################################
+# void chacha20_vx_4x(u8 *out, const u8 *inp, size_t len,
+#		      const u32 *key, const u32 *counter)
+
+#define OUT %r2
+#define INP %r3
+#define LEN %r4
+#define KEY %r5
+#define COUNTER %r6
+
+#define BEPERM %v31
+#define CTR %v26
+
+#define K0 %v16
+#define K1 %v17
+#define K2 %v18
+#define K3 %v19
+
+#define XA0 %v0
+#define XA1 %v1
+#define XA2 %v2
+#define XA3 %v3
+
+#define XB0 %v4
+#define XB1 %v5
+#define XB2 %v6
+#define XB3 %v7
+
+#define XC0 %v8
+#define XC1 %v9
+#define XC2 %v10
+#define XC3 %v11
+
+#define XD0 %v12
+#define XD1 %v13
+#define XD2 %v14
+#define XD3 %v15
+
+#define XT0 %v27
+#define XT1 %v28
+#define XT2 %v29
+#define XT3 %v30
+
+SYM_FUNC_START(chacha20_vx_4x)
+ stmg %r6,%r7,6*8(SP)
+
+ larl %r7,sigma
+ lhi %r0,10
+ lhi %r1,0
+
+ VL K0,0,,%r7 # load sigma
+ VL K1,0,,KEY # load key
+ VL K2,16,,KEY
+ VL K3,0,,COUNTER # load counter
+
+ VL BEPERM,0x40,,%r7
+ VL CTR,0x50,,%r7
+
+ VLM XA0,XA3,0x60,%r7,4 # load [smashed] sigma
+
+ VREPF XB0,K1,0 # smash the key
+ VREPF XB1,K1,1
+ VREPF XB2,K1,2
+ VREPF XB3,K1,3
+
+ VREPF XD0,K3,0
+ VREPF XD1,K3,1
+ VREPF XD2,K3,2
+ VREPF XD3,K3,3
+ VAF XD0,XD0,CTR
+
+ VREPF XC0,K2,0
+ VREPF XC1,K2,1
+ VREPF XC2,K2,2
+ VREPF XC3,K2,3
+
+.Loop_4x:
+ VAF XA0,XA0,XB0
+ VX XD0,XD0,XA0
+ VERLLF XD0,XD0,16
+
+ VAF XA1,XA1,XB1
+ VX XD1,XD1,XA1
+ VERLLF XD1,XD1,16
+
+ VAF XA2,XA2,XB2
+ VX XD2,XD2,XA2
+ VERLLF XD2,XD2,16
+
+ VAF XA3,XA3,XB3
+ VX XD3,XD3,XA3
+ VERLLF XD3,XD3,16
+
+ VAF XC0,XC0,XD0
+ VX XB0,XB0,XC0
+ VERLLF XB0,XB0,12
+
+ VAF XC1,XC1,XD1
+ VX XB1,XB1,XC1
+ VERLLF XB1,XB1,12
+
+ VAF XC2,XC2,XD2
+ VX XB2,XB2,XC2
+ VERLLF XB2,XB2,12
+
+ VAF XC3,XC3,XD3
+ VX XB3,XB3,XC3
+ VERLLF XB3,XB3,12
+
+ VAF XA0,XA0,XB0
+ VX XD0,XD0,XA0
+ VERLLF XD0,XD0,8
+
+ VAF XA1,XA1,XB1
+ VX XD1,XD1,XA1
+ VERLLF XD1,XD1,8
+
+ VAF XA2,XA2,XB2
+ VX XD2,XD2,XA2
+ VERLLF XD2,XD2,8
+
+ VAF XA3,XA3,XB3
+ VX XD3,XD3,XA3
+ VERLLF XD3,XD3,8
+
+ VAF XC0,XC0,XD0
+ VX XB0,XB0,XC0
+ VERLLF XB0,XB0,7
+
+ VAF XC1,XC1,XD1
+ VX XB1,XB1,XC1
+ VERLLF XB1,XB1,7
+
+ VAF XC2,XC2,XD2
+ VX XB2,XB2,XC2
+ VERLLF XB2,XB2,7
+
+ VAF XC3,XC3,XD3
+ VX XB3,XB3,XC3
+ VERLLF XB3,XB3,7
+
+ VAF XA0,XA0,XB1
+ VX XD3,XD3,XA0
+ VERLLF XD3,XD3,16
+
+ VAF XA1,XA1,XB2
+ VX XD0,XD0,XA1
+ VERLLF XD0,XD0,16
+
+ VAF XA2,XA2,XB3
+ VX XD1,XD1,XA2
+ VERLLF XD1,XD1,16
+
+ VAF XA3,XA3,XB0
+ VX XD2,XD2,XA3
+ VERLLF XD2,XD2,16
+
+ VAF XC2,XC2,XD3
+ VX XB1,XB1,XC2
+ VERLLF XB1,XB1,12
+
+ VAF XC3,XC3,XD0
+ VX XB2,XB2,XC3
+ VERLLF XB2,XB2,12
+
+ VAF XC0,XC0,XD1
+ VX XB3,XB3,XC0
+ VERLLF XB3,XB3,12
+
+ VAF XC1,XC1,XD2
+ VX XB0,XB0,XC1
+ VERLLF XB0,XB0,12
+
+ VAF XA0,XA0,XB1
+ VX XD3,XD3,XA0
+ VERLLF XD3,XD3,8
+
+ VAF XA1,XA1,XB2
+ VX XD0,XD0,XA1
+ VERLLF XD0,XD0,8
+
+ VAF XA2,XA2,XB3
+ VX XD1,XD1,XA2
+ VERLLF XD1,XD1,8
+
+ VAF XA3,XA3,XB0
+ VX XD2,XD2,XA3
+ VERLLF XD2,XD2,8
+
+ VAF XC2,XC2,XD3
+ VX XB1,XB1,XC2
+ VERLLF XB1,XB1,7
+
+ VAF XC3,XC3,XD0
+ VX XB2,XB2,XC3
+ VERLLF XB2,XB2,7
+
+ VAF XC0,XC0,XD1
+ VX XB3,XB3,XC0
+ VERLLF XB3,XB3,7
+
+ VAF XC1,XC1,XD2
+ VX XB0,XB0,XC1
+ VERLLF XB0,XB0,7
+ brct %r0,.Loop_4x
+
+ VAF XD0,XD0,CTR
+
+ VMRHF XT0,XA0,XA1 # transpose data
+ VMRHF XT1,XA2,XA3
+ VMRLF XT2,XA0,XA1
+ VMRLF XT3,XA2,XA3
+ VPDI XA0,XT0,XT1,0b0000
+ VPDI XA1,XT0,XT1,0b0101
+ VPDI XA2,XT2,XT3,0b0000
+ VPDI XA3,XT2,XT3,0b0101
+
+ VMRHF XT0,XB0,XB1
+ VMRHF XT1,XB2,XB3
+ VMRLF XT2,XB0,XB1
+ VMRLF XT3,XB2,XB3
+ VPDI XB0,XT0,XT1,0b0000
+ VPDI XB1,XT0,XT1,0b0101
+ VPDI XB2,XT2,XT3,0b0000
+ VPDI XB3,XT2,XT3,0b0101
+
+ VMRHF XT0,XC0,XC1
+ VMRHF XT1,XC2,XC3
+ VMRLF XT2,XC0,XC1
+ VMRLF XT3,XC2,XC3
+ VPDI XC0,XT0,XT1,0b0000
+ VPDI XC1,XT0,XT1,0b0101
+ VPDI XC2,XT2,XT3,0b0000
+ VPDI XC3,XT2,XT3,0b0101
+
+ VMRHF XT0,XD0,XD1
+ VMRHF XT1,XD2,XD3
+ VMRLF XT2,XD0,XD1
+ VMRLF XT3,XD2,XD3
+ VPDI XD0,XT0,XT1,0b0000
+ VPDI XD1,XT0,XT1,0b0101
+ VPDI XD2,XT2,XT3,0b0000
+ VPDI XD3,XT2,XT3,0b0101
+
+ VAF XA0,XA0,K0
+ VAF XB0,XB0,K1
+ VAF XC0,XC0,K2
+ VAF XD0,XD0,K3
+
+ VPERM XA0,XA0,XA0,BEPERM
+ VPERM XB0,XB0,XB0,BEPERM
+ VPERM XC0,XC0,XC0,BEPERM
+ VPERM XD0,XD0,XD0,BEPERM
+
+ VLM XT0,XT3,0,INP,0
+
+ VX XT0,XT0,XA0
+ VX XT1,XT1,XB0
+ VX XT2,XT2,XC0
+ VX XT3,XT3,XD0
+
+ VSTM XT0,XT3,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+
+ VAF XA0,XA1,K0
+ VAF XB0,XB1,K1
+ VAF XC0,XC1,K2
+ VAF XD0,XD1,K3
+
+ VPERM XA0,XA0,XA0,BEPERM
+ VPERM XB0,XB0,XB0,BEPERM
+ VPERM XC0,XC0,XC0,BEPERM
+ VPERM XD0,XD0,XD0,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_4x
+
+ VLM XT0,XT3,0,INP,0
+
+ VX XT0,XT0,XA0
+ VX XT1,XT1,XB0
+ VX XT2,XT2,XC0
+ VX XT3,XT3,XD0
+
+ VSTM XT0,XT3,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_4x
+
+ VAF XA0,XA2,K0
+ VAF XB0,XB2,K1
+ VAF XC0,XC2,K2
+ VAF XD0,XD2,K3
+
+ VPERM XA0,XA0,XA0,BEPERM
+ VPERM XB0,XB0,XB0,BEPERM
+ VPERM XC0,XC0,XC0,BEPERM
+ VPERM XD0,XD0,XD0,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_4x
+
+ VLM XT0,XT3,0,INP,0
+
+ VX XT0,XT0,XA0
+ VX XT1,XT1,XB0
+ VX XT2,XT2,XC0
+ VX XT3,XT3,XD0
+
+ VSTM XT0,XT3,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_4x
+
+ VAF XA0,XA3,K0
+ VAF XB0,XB3,K1
+ VAF XC0,XC3,K2
+ VAF XD0,XD3,K3
+
+ VPERM XA0,XA0,XA0,BEPERM
+ VPERM XB0,XB0,XB0,BEPERM
+ VPERM XC0,XC0,XC0,BEPERM
+ VPERM XD0,XD0,XD0,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_4x
+
+ VLM XT0,XT3,0,INP,0
+
+ VX XT0,XT0,XA0
+ VX XT1,XT1,XB0
+ VX XT2,XT2,XC0
+ VX XT3,XT3,XD0
+
+ VSTM XT0,XT3,0,OUT,0
+
+.Ldone_4x:
+ lmg %r6,%r7,6*8(SP)
+ BR_EX %r14
+
+.Ltail_4x:
+ VLR XT0,XC0
+ VLR XT1,XD0
+
+ VST XA0,8*8+0x00,,SP
+ VST XB0,8*8+0x10,,SP
+ VST XT0,8*8+0x20,,SP
+ VST XT1,8*8+0x30,,SP
+
+ lghi %r1,0
+
+.Loop_tail_4x:
+ llgc %r5,0(%r1,INP)
+ llgc %r6,8*8(%r1,SP)
+ xr %r6,%r5
+ stc %r6,0(%r1,OUT)
+ la %r1,1(%r1)
+ brct LEN,.Loop_tail_4x
+
+ lmg %r6,%r7,6*8(SP)
+ BR_EX %r14
+SYM_FUNC_END(chacha20_vx_4x)
+
+#undef OUT
+#undef INP
+#undef LEN
+#undef KEY
+#undef COUNTER
+
+#undef BEPERM
+
+#undef K0
+#undef K1
+#undef K2
+#undef K3
+
+
+#############################################################################
+# void chacha20_vx(u8 *out, const u8 *inp, size_t len,
+#		   const u32 *key, const u32 *counter)
+
+#define OUT %r2
+#define INP %r3
+#define LEN %r4
+#define KEY %r5
+#define COUNTER %r6
+
+#define BEPERM %v31
+
+#define K0 %v27
+#define K1 %v24
+#define K2 %v25
+#define K3 %v26
+
+#define A0 %v0
+#define B0 %v1
+#define C0 %v2
+#define D0 %v3
+
+#define A1 %v4
+#define B1 %v5
+#define C1 %v6
+#define D1 %v7
+
+#define A2 %v8
+#define B2 %v9
+#define C2 %v10
+#define D2 %v11
+
+#define A3 %v12
+#define B3 %v13
+#define C3 %v14
+#define D3 %v15
+
+#define A4 %v16
+#define B4 %v17
+#define C4 %v18
+#define D4 %v19
+
+#define A5 %v20
+#define B5 %v21
+#define C5 %v22
+#define D5 %v23
+
+#define T0 %v27
+#define T1 %v28
+#define T2 %v29
+#define T3 %v30
+
+SYM_FUNC_START(chacha20_vx)
+ clgfi LEN,256
+ jle chacha20_vx_4x
+ stmg %r6,%r7,6*8(SP)
+
+ lghi %r1,-FRAME
+ lgr %r0,SP
+ la SP,0(%r1,SP)
+ stg %r0,0(SP) # back-chain
+
+ larl %r7,sigma
+ lhi %r0,10
+
+ VLM K1,K2,0,KEY,0 # load key
+ VL K3,0,,COUNTER # load counter
+
+ VLM K0,BEPERM,0,%r7,4 # load sigma, increments, ...
+
+.Loop_outer_vx:
+ VLR A0,K0
+ VLR B0,K1
+ VLR A1,K0
+ VLR B1,K1
+ VLR A2,K0
+ VLR B2,K1
+ VLR A3,K0
+ VLR B3,K1
+ VLR A4,K0
+ VLR B4,K1
+ VLR A5,K0
+ VLR B5,K1
+
+ VLR D0,K3
+ VAF D1,K3,T1 # K[3]+1
+ VAF D2,K3,T2 # K[3]+2
+ VAF D3,K3,T3 # K[3]+3
+ VAF D4,D2,T2 # K[3]+4
+ VAF D5,D2,T3 # K[3]+5
+
+ VLR C0,K2
+ VLR C1,K2
+ VLR C2,K2
+ VLR C3,K2
+ VLR C4,K2
+ VLR C5,K2
+
+ VLR T1,D1
+ VLR T2,D2
+ VLR T3,D3
+
+.Loop_vx:
+ VAF A0,A0,B0
+ VAF A1,A1,B1
+ VAF A2,A2,B2
+ VAF A3,A3,B3
+ VAF A4,A4,B4
+ VAF A5,A5,B5
+ VX D0,D0,A0
+ VX D1,D1,A1
+ VX D2,D2,A2
+ VX D3,D3,A3
+ VX D4,D4,A4
+ VX D5,D5,A5
+ VERLLF D0,D0,16
+ VERLLF D1,D1,16
+ VERLLF D2,D2,16
+ VERLLF D3,D3,16
+ VERLLF D4,D4,16
+ VERLLF D5,D5,16
+
+ VAF C0,C0,D0
+ VAF C1,C1,D1
+ VAF C2,C2,D2
+ VAF C3,C3,D3
+ VAF C4,C4,D4
+ VAF C5,C5,D5
+ VX B0,B0,C0
+ VX B1,B1,C1
+ VX B2,B2,C2
+ VX B3,B3,C3
+ VX B4,B4,C4
+ VX B5,B5,C5
+ VERLLF B0,B0,12
+ VERLLF B1,B1,12
+ VERLLF B2,B2,12
+ VERLLF B3,B3,12
+ VERLLF B4,B4,12
+ VERLLF B5,B5,12
+
+ VAF A0,A0,B0
+ VAF A1,A1,B1
+ VAF A2,A2,B2
+ VAF A3,A3,B3
+ VAF A4,A4,B4
+ VAF A5,A5,B5
+ VX D0,D0,A0
+ VX D1,D1,A1
+ VX D2,D2,A2
+ VX D3,D3,A3
+ VX D4,D4,A4
+ VX D5,D5,A5
+ VERLLF D0,D0,8
+ VERLLF D1,D1,8
+ VERLLF D2,D2,8
+ VERLLF D3,D3,8
+ VERLLF D4,D4,8
+ VERLLF D5,D5,8
+
+ VAF C0,C0,D0
+ VAF C1,C1,D1
+ VAF C2,C2,D2
+ VAF C3,C3,D3
+ VAF C4,C4,D4
+ VAF C5,C5,D5
+ VX B0,B0,C0
+ VX B1,B1,C1
+ VX B2,B2,C2
+ VX B3,B3,C3
+ VX B4,B4,C4
+ VX B5,B5,C5
+ VERLLF B0,B0,7
+ VERLLF B1,B1,7
+ VERLLF B2,B2,7
+ VERLLF B3,B3,7
+ VERLLF B4,B4,7
+ VERLLF B5,B5,7
+
+ VSLDB C0,C0,C0,8
+ VSLDB C1,C1,C1,8
+ VSLDB C2,C2,C2,8
+ VSLDB C3,C3,C3,8
+ VSLDB C4,C4,C4,8
+ VSLDB C5,C5,C5,8
+ VSLDB B0,B0,B0,4
+ VSLDB B1,B1,B1,4
+ VSLDB B2,B2,B2,4
+ VSLDB B3,B3,B3,4
+ VSLDB B4,B4,B4,4
+ VSLDB B5,B5,B5,4
+ VSLDB D0,D0,D0,12
+ VSLDB D1,D1,D1,12
+ VSLDB D2,D2,D2,12
+ VSLDB D3,D3,D3,12
+ VSLDB D4,D4,D4,12
+ VSLDB D5,D5,D5,12
+
+ VAF A0,A0,B0
+ VAF A1,A1,B1
+ VAF A2,A2,B2
+ VAF A3,A3,B3
+ VAF A4,A4,B4
+ VAF A5,A5,B5
+ VX D0,D0,A0
+ VX D1,D1,A1
+ VX D2,D2,A2
+ VX D3,D3,A3
+ VX D4,D4,A4
+ VX D5,D5,A5
+ VERLLF D0,D0,16
+ VERLLF D1,D1,16
+ VERLLF D2,D2,16
+ VERLLF D3,D3,16
+ VERLLF D4,D4,16
+ VERLLF D5,D5,16
+
+ VAF C0,C0,D0
+ VAF C1,C1,D1
+ VAF C2,C2,D2
+ VAF C3,C3,D3
+ VAF C4,C4,D4
+ VAF C5,C5,D5
+ VX B0,B0,C0
+ VX B1,B1,C1
+ VX B2,B2,C2
+ VX B3,B3,C3
+ VX B4,B4,C4
+ VX B5,B5,C5
+ VERLLF B0,B0,12
+ VERLLF B1,B1,12
+ VERLLF B2,B2,12
+ VERLLF B3,B3,12
+ VERLLF B4,B4,12
+ VERLLF B5,B5,12
+
+ VAF A0,A0,B0
+ VAF A1,A1,B1
+ VAF A2,A2,B2
+ VAF A3,A3,B3
+ VAF A4,A4,B4
+ VAF A5,A5,B5
+ VX D0,D0,A0
+ VX D1,D1,A1
+ VX D2,D2,A2
+ VX D3,D3,A3
+ VX D4,D4,A4
+ VX D5,D5,A5
+ VERLLF D0,D0,8
+ VERLLF D1,D1,8
+ VERLLF D2,D2,8
+ VERLLF D3,D3,8
+ VERLLF D4,D4,8
+ VERLLF D5,D5,8
+
+ VAF C0,C0,D0
+ VAF C1,C1,D1
+ VAF C2,C2,D2
+ VAF C3,C3,D3
+ VAF C4,C4,D4
+ VAF C5,C5,D5
+ VX B0,B0,C0
+ VX B1,B1,C1
+ VX B2,B2,C2
+ VX B3,B3,C3
+ VX B4,B4,C4
+ VX B5,B5,C5
+ VERLLF B0,B0,7
+ VERLLF B1,B1,7
+ VERLLF B2,B2,7
+ VERLLF B3,B3,7
+ VERLLF B4,B4,7
+ VERLLF B5,B5,7
+
+ VSLDB C0,C0,C0,8
+ VSLDB C1,C1,C1,8
+ VSLDB C2,C2,C2,8
+ VSLDB C3,C3,C3,8
+ VSLDB C4,C4,C4,8
+ VSLDB C5,C5,C5,8
+ VSLDB B0,B0,B0,12
+ VSLDB B1,B1,B1,12
+ VSLDB B2,B2,B2,12
+ VSLDB B3,B3,B3,12
+ VSLDB B4,B4,B4,12
+ VSLDB B5,B5,B5,12
+ VSLDB D0,D0,D0,4
+ VSLDB D1,D1,D1,4
+ VSLDB D2,D2,D2,4
+ VSLDB D3,D3,D3,4
+ VSLDB D4,D4,D4,4
+ VSLDB D5,D5,D5,4
+ brct %r0,.Loop_vx
+
+ VAF A0,A0,K0
+ VAF B0,B0,K1
+ VAF C0,C0,K2
+ VAF D0,D0,K3
+ VAF A1,A1,K0
+ VAF D1,D1,T1 # +K[3]+1
+
+ VPERM A0,A0,A0,BEPERM
+ VPERM B0,B0,B0,BEPERM
+ VPERM C0,C0,C0,BEPERM
+ VPERM D0,D0,D0,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VAF D2,D2,T2 # +K[3]+2
+ VAF D3,D3,T3 # +K[3]+3
+ VLM T0,T3,0,INP,0
+
+ VX A0,A0,T0
+ VX B0,B0,T1
+ VX C0,C0,T2
+ VX D0,D0,T3
+
+ VLM K0,T3,0,%r7,4 # re-load sigma and increments
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF B1,B1,K1
+ VAF C1,C1,K2
+
+ VPERM A0,A1,A1,BEPERM
+ VPERM B0,B1,B1,BEPERM
+ VPERM C0,C1,C1,BEPERM
+ VPERM D0,D1,D1,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF A2,A2,K0
+ VAF B2,B2,K1
+ VAF C2,C2,K2
+
+ VPERM A0,A2,A2,BEPERM
+ VPERM B0,B2,B2,BEPERM
+ VPERM C0,C2,C2,BEPERM
+ VPERM D0,D2,D2,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF A3,A3,K0
+ VAF B3,B3,K1
+ VAF C3,C3,K2
+ VAF D2,K3,T3 # K[3]+3
+
+ VPERM A0,A3,A3,BEPERM
+ VPERM B0,B3,B3,BEPERM
+ VPERM C0,C3,C3,BEPERM
+ VPERM D0,D3,D3,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VAF D3,D2,T1 # K[3]+4
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF A4,A4,K0
+ VAF B4,B4,K1
+ VAF C4,C4,K2
+ VAF D4,D4,D3 # +K[3]+4
+ VAF D3,D3,T1 # K[3]+5
+ VAF K3,D2,T3 # K[3]+=6
+
+ VPERM A0,A4,A4,BEPERM
+ VPERM B0,B4,B4,BEPERM
+ VPERM C0,C4,C4,BEPERM
+ VPERM D0,D4,D4,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF A5,A5,K0
+ VAF B5,B5,K1
+ VAF C5,C5,K2
+ VAF D5,D5,D3 # +K[3]+5
+
+ VPERM A0,A5,A5,BEPERM
+ VPERM B0,B5,B5,BEPERM
+ VPERM C0,C5,C5,BEPERM
+ VPERM D0,D5,D5,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ lhi %r0,10
+ aghi LEN,-0x40
+ jne .Loop_outer_vx
+
+.Ldone_vx:
+ lmg %r6,%r7,FRAME+6*8(SP)
+ la SP,FRAME(SP)
+ BR_EX %r14
+
+.Ltail_vx:
+ VSTM A0,D0,8*8,SP,3
+ lghi %r1,0
+
+.Loop_tail_vx:
+ llgc %r5,0(%r1,INP)
+ llgc %r6,8*8(%r1,SP)
+ xr %r6,%r5
+ stc %r6,0(%r1,OUT)
+ la %r1,1(%r1)
+ brct LEN,.Loop_tail_vx
+
+ lmg %r6,%r7,FRAME+6*8(SP)
+ la SP,FRAME(SP)
+ BR_EX %r14
+SYM_FUNC_END(chacha20_vx)
+
+.previous
diff --git a/lib/crypto/s390/chacha-s390.h b/lib/crypto/s390/chacha-s390.h
new file mode 100644
index 000000000000..733744ce30f5
--- /dev/null
+++ b/lib/crypto/s390/chacha-s390.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * s390 ChaCha stream cipher.
+ *
+ * Copyright IBM Corp. 2021
+ */
+
+#ifndef _CHACHA_S390_H
+#define _CHACHA_S390_H
+
+void chacha20_vx(u8 *out, const u8 *inp, size_t len, const u32 *key,
+ const u32 *counter);
+
+#endif /* _CHACHA_S390_H */
diff --git a/lib/crypto/s390/chacha.h b/lib/crypto/s390/chacha.h
new file mode 100644
index 000000000000..fd9c4a422365
--- /dev/null
+++ b/lib/crypto/s390/chacha.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ChaCha stream cipher (s390 optimized)
+ *
+ * Copyright IBM Corp. 2021
+ */
+
+#include <linux/cpufeature.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+#include <asm/fpu.h>
+#include "chacha-s390.h"
+
+#define hchacha_block_arch hchacha_block_generic /* not implemented yet */
+
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int bytes, int nrounds)
+{
+	/*
+	 * The s390 ChaCha20 implementation has the 20 rounds hard-coded.
+	 * It cannot handle one block of data or less, but otherwise it
+	 * can handle data of arbitrary size.
+	 */
+ if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20 || !cpu_has_vx()) {
+ chacha_crypt_generic(state, dst, src, bytes, nrounds);
+ } else {
+ DECLARE_KERNEL_FPU_ONSTACK32(vxstate);
+
+ kernel_fpu_begin(&vxstate, KERNEL_VXR);
+ chacha20_vx(dst, src, bytes, &state->x[4], &state->x[12]);
+ kernel_fpu_end(&vxstate, KERNEL_VXR);
+
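+		/*
+		 * chacha20_vx() does not write the counter back to the
+		 * state, so advance it here by the number of blocks
+		 * consumed, rounding up for a partial final block
+		 * (e.g. 100 bytes -> round_up(100, 64) / 64 == 2 blocks).
+		 */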
+ state->x[12] += round_up(bytes, CHACHA_BLOCK_SIZE) /
+ CHACHA_BLOCK_SIZE;
+ }
+}
diff --git a/lib/crypto/s390/sha1.h b/lib/crypto/s390/sha1.h
new file mode 100644
index 000000000000..73d94476a157
--- /dev/null
+++ b/lib/crypto/s390/sha1.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-1 optimized using the CP Assist for Cryptographic Functions (CPACF)
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/cpacf.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha1);
+
+static void sha1_blocks(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_cpacf_sha1))
+ cpacf_kimd(CPACF_KIMD_SHA_1, state, data,
+ nblocks * SHA1_BLOCK_SIZE);
+ else
+ sha1_blocks_generic(state, data, nblocks);
+}
+
+#define sha1_mod_init_arch sha1_mod_init_arch
+static void sha1_mod_init_arch(void)
+{
+ if (cpu_have_feature(S390_CPU_FEATURE_MSA) &&
+ cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_1))
+ static_branch_enable(&have_cpacf_sha1);
+}
diff --git a/lib/crypto/s390/sha256.h b/lib/crypto/s390/sha256.h
new file mode 100644
index 000000000000..acd483508789
--- /dev/null
+++ b/lib/crypto/s390/sha256.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 optimized using the CP Assist for Cryptographic Functions (CPACF)
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/cpacf.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha256);
+
+static void sha256_blocks(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_cpacf_sha256))
+ cpacf_kimd(CPACF_KIMD_SHA_256, state, data,
+ nblocks * SHA256_BLOCK_SIZE);
+ else
+ sha256_blocks_generic(state, data, nblocks);
+}
+
+#define sha256_mod_init_arch sha256_mod_init_arch
+static void sha256_mod_init_arch(void)
+{
+ if (cpu_have_feature(S390_CPU_FEATURE_MSA) &&
+ cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256))
+ static_branch_enable(&have_cpacf_sha256);
+}
diff --git a/lib/crypto/s390/sha3.h b/lib/crypto/s390/sha3.h
new file mode 100644
index 000000000000..85471404775a
--- /dev/null
+++ b/lib/crypto/s390/sha3.h
@@ -0,0 +1,151 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-3 optimized using the CP Assist for Cryptographic Functions (CPACF)
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/cpacf.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3_init_optim);
+
+static void sha3_absorb_blocks(struct sha3_state *state, const u8 *data,
+ size_t nblocks, size_t block_size)
+{
+ if (static_branch_likely(&have_sha3)) {
+ /*
+ * Note that KIMD assumes little-endian order of the state
+ * words. sha3_state already uses that order, though, so
+ * there's no need for a byteswap.
+ */
+ switch (block_size) {
+ case SHA3_224_BLOCK_SIZE:
+ cpacf_kimd(CPACF_KIMD_SHA3_224, state,
+ data, nblocks * block_size);
+ return;
+ case SHA3_256_BLOCK_SIZE:
+ /*
+ * This case handles both SHA3-256 and SHAKE256, since
+ * they have the same block size.
+ */
+ cpacf_kimd(CPACF_KIMD_SHA3_256, state,
+ data, nblocks * block_size);
+ return;
+ case SHA3_384_BLOCK_SIZE:
+ cpacf_kimd(CPACF_KIMD_SHA3_384, state,
+ data, nblocks * block_size);
+ return;
+ case SHA3_512_BLOCK_SIZE:
+ cpacf_kimd(CPACF_KIMD_SHA3_512, state,
+ data, nblocks * block_size);
+ return;
+ }
+ }
+ sha3_absorb_blocks_generic(state, data, nblocks, block_size);
+}
+
+static void sha3_keccakf(struct sha3_state *state)
+{
+ if (static_branch_likely(&have_sha3)) {
+ /*
+ * Passing zeroes into any of CPACF_KIMD_SHA3_* gives the plain
+ * Keccak-f permutation, which is what we want here. Use
+ * SHA3-512 since it has the smallest block size.
+ */
+ static const u8 zeroes[SHA3_512_BLOCK_SIZE];
+
+ cpacf_kimd(CPACF_KIMD_SHA3_512, state, zeroes, sizeof(zeroes));
+ } else {
+ sha3_keccakf_generic(state);
+ }
+}
+
+static inline bool s390_sha3(int func, const u8 *in, size_t in_len,
+ u8 *out, size_t out_len)
+{
+ struct sha3_state state;
+
+ if (!static_branch_likely(&have_sha3))
+ return false;
+
+ if (static_branch_likely(&have_sha3_init_optim))
+ func |= CPACF_KLMD_NIP | CPACF_KLMD_DUFOP;
+ else
+ memset(&state, 0, sizeof(state));
+
+ cpacf_klmd(func, &state, in, in_len);
+
+ if (static_branch_likely(&have_sha3_init_optim))
+ kmsan_unpoison_memory(&state, out_len);
+
+ memcpy(out, &state, out_len);
+ memzero_explicit(&state, sizeof(state));
+ return true;
+}
+
+#define sha3_224_arch sha3_224_arch
+static bool sha3_224_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_224_DIGEST_SIZE])
+{
+ return s390_sha3(CPACF_KLMD_SHA3_224, in, in_len,
+ out, SHA3_224_DIGEST_SIZE);
+}
+
+#define sha3_256_arch sha3_256_arch
+static bool sha3_256_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_256_DIGEST_SIZE])
+{
+ return s390_sha3(CPACF_KLMD_SHA3_256, in, in_len,
+ out, SHA3_256_DIGEST_SIZE);
+}
+
+#define sha3_384_arch sha3_384_arch
+static bool sha3_384_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_384_DIGEST_SIZE])
+{
+ return s390_sha3(CPACF_KLMD_SHA3_384, in, in_len,
+ out, SHA3_384_DIGEST_SIZE);
+}
+
+#define sha3_512_arch sha3_512_arch
+static bool sha3_512_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_512_DIGEST_SIZE])
+{
+ return s390_sha3(CPACF_KLMD_SHA3_512, in, in_len,
+ out, SHA3_512_DIGEST_SIZE);
+}
+
+#define sha3_mod_init_arch sha3_mod_init_arch
+static void sha3_mod_init_arch(void)
+{
+ int num_present = 0;
+ int num_possible = 0;
+
+ if (!cpu_have_feature(S390_CPU_FEATURE_MSA))
+ return;
+ /*
+ * Since all the SHA-3 functions are in Message-Security-Assist
+ * Extension 6, treat them as all or nothing. That way a single
+ * static_key covers every KIMD and KLMD SHA-3 function here.
+ */
+#define QUERY(opcode, func) \
+ ({ num_present += !!cpacf_query_func(opcode, func); num_possible++; })
+ QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_224);
+ QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_256);
+ QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_384);
+ QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_512);
+ QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_224);
+ QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_256);
+ QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_384);
+ QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_512);
+#undef QUERY
+
+ if (num_present == num_possible) {
+ static_branch_enable(&have_sha3);
+ if (test_facility(86))
+ static_branch_enable(&have_sha3_init_optim);
+ } else if (num_present != 0) {
+ pr_warn("Unsupported combination of SHA-3 facilities\n");
+ }
+}
diff --git a/lib/crypto/s390/sha512.h b/lib/crypto/s390/sha512.h
new file mode 100644
index 000000000000..46699d43df7e
--- /dev/null
+++ b/lib/crypto/s390/sha512.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-512 optimized using the CP Assist for Cryptographic Functions (CPACF)
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/cpacf.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha512);
+
+static void sha512_blocks(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_cpacf_sha512))
+ cpacf_kimd(CPACF_KIMD_SHA_512, state, data,
+ nblocks * SHA512_BLOCK_SIZE);
+ else
+ sha512_blocks_generic(state, data, nblocks);
+}
+
+#define sha512_mod_init_arch sha512_mod_init_arch
+static void sha512_mod_init_arch(void)
+{
+ if (cpu_have_feature(S390_CPU_FEATURE_MSA) &&
+ cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_512))
+ static_branch_enable(&have_cpacf_sha512);
+}
diff --git a/lib/crypto/sha1.c b/lib/crypto/sha1.c
index ebb60519ae93..52788278cd17 100644
--- a/lib/crypto/sha1.c
+++ b/lib/crypto/sha1.c
@@ -1,18 +1,22 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * SHA1 routine optimized to do word accesses rather than byte accesses,
- * and to avoid unnecessary copies into the context array.
- *
- * This was based on the git SHA1 implementation.
+ * SHA-1 and HMAC-SHA1 library functions
*/
-#include <linux/kernel.h>
+#include <crypto/hmac.h>
+#include <crypto/sha1.h>
+#include <linux/bitops.h>
#include <linux/export.h>
+#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/bitops.h>
#include <linux/string.h>
-#include <crypto/sha1.h>
#include <linux/unaligned.h>
+#include <linux/wordpart.h>
+#include "fips.h"
+
+static const struct sha1_block_state sha1_iv = {
+ .h = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
+};
/*
* If you have 32 registers or more, the compiler can (and should)
@@ -124,10 +128,10 @@ void sha1_transform(__u32 *digest, const char *data, __u32 *array)
EXPORT_SYMBOL(sha1_transform);
/**
- * sha1_init - initialize the vectors for a SHA1 digest
+ * sha1_init_raw - initialize the vectors for a SHA1 digest
* @buf: vector to initialize
*/
-void sha1_init(__u32 *buf)
+void sha1_init_raw(__u32 *buf)
{
buf[0] = 0x67452301;
buf[1] = 0xefcdab89;
@@ -135,7 +139,227 @@ void sha1_init(__u32 *buf)
buf[3] = 0x10325476;
buf[4] = 0xc3d2e1f0;
}
-EXPORT_SYMBOL(sha1_init);
+EXPORT_SYMBOL(sha1_init_raw);
+
+static void __maybe_unused sha1_blocks_generic(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ u32 workspace[SHA1_WORKSPACE_WORDS];
+
+ do {
+ sha1_transform(state->h, data, workspace);
+ data += SHA1_BLOCK_SIZE;
+ } while (--nblocks);
+
+ memzero_explicit(workspace, sizeof(workspace));
+}
+
+#ifdef CONFIG_CRYPTO_LIB_SHA1_ARCH
+#include "sha1.h" /* $(SRCARCH)/sha1.h */
+#else
+#define sha1_blocks sha1_blocks_generic
+#endif
+
+void sha1_init(struct sha1_ctx *ctx)
+{
+ ctx->state = sha1_iv;
+ ctx->bytecount = 0;
+}
+EXPORT_SYMBOL_GPL(sha1_init);
+
+void sha1_update(struct sha1_ctx *ctx, const u8 *data, size_t len)
+{
+ size_t partial = ctx->bytecount % SHA1_BLOCK_SIZE;
+
+ ctx->bytecount += len;
+
+ if (partial + len >= SHA1_BLOCK_SIZE) {
+ size_t nblocks;
+
+ if (partial) {
+ size_t l = SHA1_BLOCK_SIZE - partial;
+
+ memcpy(&ctx->buf[partial], data, l);
+ data += l;
+ len -= l;
+
+ sha1_blocks(&ctx->state, ctx->buf, 1);
+ }
+
+ nblocks = len / SHA1_BLOCK_SIZE;
+ len %= SHA1_BLOCK_SIZE;
+
+ if (nblocks) {
+ sha1_blocks(&ctx->state, data, nblocks);
+ data += nblocks * SHA1_BLOCK_SIZE;
+ }
+ partial = 0;
+ }
+ if (len)
+ memcpy(&ctx->buf[partial], data, len);
+}
+EXPORT_SYMBOL_GPL(sha1_update);
+
+static void __sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE])
+{
+ u64 bitcount = ctx->bytecount << 3;
+ size_t partial = ctx->bytecount % SHA1_BLOCK_SIZE;
+
+ ctx->buf[partial++] = 0x80;
+ if (partial > SHA1_BLOCK_SIZE - 8) {
+ memset(&ctx->buf[partial], 0, SHA1_BLOCK_SIZE - partial);
+ sha1_blocks(&ctx->state, ctx->buf, 1);
+ partial = 0;
+ }
+ memset(&ctx->buf[partial], 0, SHA1_BLOCK_SIZE - 8 - partial);
+ *(__be64 *)&ctx->buf[SHA1_BLOCK_SIZE - 8] = cpu_to_be64(bitcount);
+ sha1_blocks(&ctx->state, ctx->buf, 1);
+
+ for (size_t i = 0; i < SHA1_DIGEST_SIZE; i += 4)
+ put_unaligned_be32(ctx->state.h[i / 4], out + i);
+}
+
+void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE])
+{
+ __sha1_final(ctx, out);
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(sha1_final);
+
+void sha1(const u8 *data, size_t len, u8 out[SHA1_DIGEST_SIZE])
+{
+ struct sha1_ctx ctx;
+
+ sha1_init(&ctx);
+ sha1_update(&ctx, data, len);
+ sha1_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha1);
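The one-shot sha1() above is simply init/update/final in sequence. A minimal usage sketch of the new API (the demo_sha1() wrapper is hypothetical, not part of the patch):

static void demo_sha1(const u8 *msg, size_t msg_len)
{
	struct sha1_ctx ctx;
	u8 digest[SHA1_DIGEST_SIZE];

	sha1_init(&ctx);
	sha1_update(&ctx, msg, msg_len);	/* may be called repeatedly */
	sha1_final(&ctx, digest);		/* also zeroizes the context */
}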
+
+static void __hmac_sha1_preparekey(struct sha1_block_state *istate,
+ struct sha1_block_state *ostate,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ union {
+ u8 b[SHA1_BLOCK_SIZE];
+ unsigned long w[SHA1_BLOCK_SIZE / sizeof(unsigned long)];
+ } derived_key = { 0 };
+
+ if (unlikely(raw_key_len > SHA1_BLOCK_SIZE))
+ sha1(raw_key, raw_key_len, derived_key.b);
+ else
+ memcpy(derived_key.b, raw_key, raw_key_len);
+
+ for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+ derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE);
+ *istate = sha1_iv;
+ sha1_blocks(istate, derived_key.b, 1);
+
+ for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+ derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^
+ HMAC_IPAD_VALUE);
+ *ostate = sha1_iv;
+ sha1_blocks(ostate, derived_key.b, 1);
+
+ memzero_explicit(&derived_key, sizeof(derived_key));
+}
+
+void hmac_sha1_preparekey(struct hmac_sha1_key *key,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_sha1_preparekey(&key->istate, &key->ostate,
+ raw_key, raw_key_len);
+}
+EXPORT_SYMBOL_GPL(hmac_sha1_preparekey);
+
+void hmac_sha1_init(struct hmac_sha1_ctx *ctx, const struct hmac_sha1_key *key)
+{
+ ctx->sha_ctx.state = key->istate;
+ ctx->sha_ctx.bytecount = SHA1_BLOCK_SIZE;
+ ctx->ostate = key->ostate;
+}
+EXPORT_SYMBOL_GPL(hmac_sha1_init);
+
+void hmac_sha1_init_usingrawkey(struct hmac_sha1_ctx *ctx,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_sha1_preparekey(&ctx->sha_ctx.state, &ctx->ostate,
+ raw_key, raw_key_len);
+ ctx->sha_ctx.bytecount = SHA1_BLOCK_SIZE;
+}
+EXPORT_SYMBOL_GPL(hmac_sha1_init_usingrawkey);
+
+void hmac_sha1_final(struct hmac_sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE])
+{
+ /* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */
+ __sha1_final(&ctx->sha_ctx, ctx->sha_ctx.buf);
+ memset(&ctx->sha_ctx.buf[SHA1_DIGEST_SIZE], 0,
+ SHA1_BLOCK_SIZE - SHA1_DIGEST_SIZE);
+ ctx->sha_ctx.buf[SHA1_DIGEST_SIZE] = 0x80;
+ *(__be32 *)&ctx->sha_ctx.buf[SHA1_BLOCK_SIZE - 4] =
+ cpu_to_be32(8 * (SHA1_BLOCK_SIZE + SHA1_DIGEST_SIZE));
+
+ /* Compute the outer hash, which gives the HMAC value. */
+ sha1_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1);
+ for (size_t i = 0; i < SHA1_DIGEST_SIZE; i += 4)
+ put_unaligned_be32(ctx->ostate.h[i / 4], out + i);
+
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(hmac_sha1_final);
+
+void hmac_sha1(const struct hmac_sha1_key *key,
+ const u8 *data, size_t data_len, u8 out[SHA1_DIGEST_SIZE])
+{
+ struct hmac_sha1_ctx ctx;
+
+ hmac_sha1_init(&ctx, key);
+ hmac_sha1_update(&ctx, data, data_len);
+ hmac_sha1_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha1);
+
+void hmac_sha1_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+ const u8 *data, size_t data_len,
+ u8 out[SHA1_DIGEST_SIZE])
+{
+ struct hmac_sha1_ctx ctx;
+
+ hmac_sha1_init_usingrawkey(&ctx, raw_key, raw_key_len);
+ hmac_sha1_update(&ctx, data, data_len);
+ hmac_sha1_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha1_usingrawkey);
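Callers authenticating many messages under one key can amortize the ipad/opad precomputation with hmac_sha1_preparekey(); a hedged sketch (demo_hmac_sha1() is hypothetical):

static void demo_hmac_sha1(const u8 *raw_key, size_t raw_key_len,
			   const u8 *data, size_t data_len)
{
	struct hmac_sha1_key key;
	u8 mac[SHA1_DIGEST_SIZE];

	hmac_sha1_preparekey(&key, raw_key, raw_key_len);
	hmac_sha1(&key, data, data_len, mac);	/* key reusable per message */
}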
+
+#if defined(sha1_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS)
+static int __init sha1_mod_init(void)
+{
+#ifdef sha1_mod_init_arch
+ sha1_mod_init_arch();
+#endif
+ if (fips_enabled) {
+ /*
+ * FIPS cryptographic algorithm self-test. As per the FIPS
+ * Implementation Guidance, testing HMAC-SHA1 satisfies the test
+ * requirement for SHA-1 too.
+ */
+ u8 mac[SHA1_DIGEST_SIZE];
+
+ hmac_sha1_usingrawkey(fips_test_key, sizeof(fips_test_key),
+ fips_test_data, sizeof(fips_test_data),
+ mac);
+ if (memcmp(fips_test_hmac_sha1_value, mac, sizeof(mac)) != 0)
+ panic("sha1: FIPS self-test failed\n");
+ }
+ return 0;
+}
+subsys_initcall(sha1_mod_init);
+
+static void __exit sha1_mod_exit(void)
+{
+}
+module_exit(sha1_mod_exit);
+#endif
-MODULE_DESCRIPTION("SHA-1 Algorithm");
+MODULE_DESCRIPTION("SHA-1 and HMAC-SHA1 library functions");
MODULE_LICENSE("GPL");
diff --git a/lib/crypto/sha256-generic.c b/lib/crypto/sha256-generic.c
deleted file mode 100644
index a16ad4f25ebb..000000000000
--- a/lib/crypto/sha256-generic.c
+++ /dev/null
@@ -1,137 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256, as specified in
- * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf
- *
- * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>.
- *
- * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
- * Copyright (c) 2014 Red Hat Inc.
- */
-
-#include <crypto/internal/sha2.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/unaligned.h>
-
-static const u32 SHA256_K[] = {
- 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
- 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
- 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
- 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
- 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
- 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
- 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
- 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
- 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
- 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
- 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
- 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
- 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
- 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
- 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
- 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
-};
-
-static inline u32 Ch(u32 x, u32 y, u32 z)
-{
- return z ^ (x & (y ^ z));
-}
-
-static inline u32 Maj(u32 x, u32 y, u32 z)
-{
- return (x & y) | (z & (x | y));
-}
-
-#define e0(x) (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22))
-#define e1(x) (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25))
-#define s0(x) (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3))
-#define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10))
-
-static inline void LOAD_OP(int I, u32 *W, const u8 *input)
-{
- W[I] = get_unaligned_be32((__u32 *)input + I);
-}
-
-static inline void BLEND_OP(int I, u32 *W)
-{
- W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16];
-}
-
-#define SHA256_ROUND(i, a, b, c, d, e, f, g, h) do { \
- u32 t1, t2; \
- t1 = h + e1(e) + Ch(e, f, g) + SHA256_K[i] + W[i]; \
- t2 = e0(a) + Maj(a, b, c); \
- d += t1; \
- h = t1 + t2; \
-} while (0)
-
-static void sha256_block_generic(u32 state[SHA256_STATE_WORDS],
- const u8 *input, u32 W[64])
-{
- u32 a, b, c, d, e, f, g, h;
- int i;
-
- /* load the input */
- for (i = 0; i < 16; i += 8) {
- LOAD_OP(i + 0, W, input);
- LOAD_OP(i + 1, W, input);
- LOAD_OP(i + 2, W, input);
- LOAD_OP(i + 3, W, input);
- LOAD_OP(i + 4, W, input);
- LOAD_OP(i + 5, W, input);
- LOAD_OP(i + 6, W, input);
- LOAD_OP(i + 7, W, input);
- }
-
- /* now blend */
- for (i = 16; i < 64; i += 8) {
- BLEND_OP(i + 0, W);
- BLEND_OP(i + 1, W);
- BLEND_OP(i + 2, W);
- BLEND_OP(i + 3, W);
- BLEND_OP(i + 4, W);
- BLEND_OP(i + 5, W);
- BLEND_OP(i + 6, W);
- BLEND_OP(i + 7, W);
- }
-
- /* load the state into our registers */
- a = state[0]; b = state[1]; c = state[2]; d = state[3];
- e = state[4]; f = state[5]; g = state[6]; h = state[7];
-
- /* now iterate */
- for (i = 0; i < 64; i += 8) {
- SHA256_ROUND(i + 0, a, b, c, d, e, f, g, h);
- SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g);
- SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f);
- SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e);
- SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d);
- SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c);
- SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b);
- SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a);
- }
-
- state[0] += a; state[1] += b; state[2] += c; state[3] += d;
- state[4] += e; state[5] += f; state[6] += g; state[7] += h;
-}
-
-void sha256_blocks_generic(u32 state[SHA256_STATE_WORDS],
- const u8 *data, size_t nblocks)
-{
- u32 W[64];
-
- do {
- sha256_block_generic(state, data, W);
- data += SHA256_BLOCK_SIZE;
- } while (--nblocks);
-
- memzero_explicit(W, sizeof(W));
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_generic);
-
-MODULE_DESCRIPTION("SHA-256 Algorithm (generic implementation)");
-MODULE_LICENSE("GPL");
diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c
index 107e5162507a..5d6b77e7e141 100644
--- a/lib/crypto/sha256.c
+++ b/lib/crypto/sha256.c
@@ -1,83 +1,515 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * SHA-256, as specified in
- * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf
- *
- * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>.
+ * SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions
*
* Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
* Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
* Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
* Copyright (c) 2014 Red Hat Inc.
+ * Copyright 2025 Google LLC
*/
-#include <crypto/internal/blockhash.h>
-#include <crypto/internal/sha2.h>
+#include <crypto/hmac.h>
+#include <crypto/sha2.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>
+#include <linux/unaligned.h>
+#include <linux/wordpart.h>
+#include "fips.h"
-/*
- * If __DISABLE_EXPORTS is defined, then this file is being compiled for a
- * pre-boot environment. In that case, ignore the kconfig options, pull the
- * generic code into the same translation unit, and use that only.
- */
-#ifdef __DISABLE_EXPORTS
-#include "sha256-generic.c"
+static const struct sha256_block_state sha224_iv = {
+ .h = {
+ SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3,
+ SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7,
+ },
+};
+
+static const struct sha256_ctx initial_sha256_ctx = {
+ .ctx = {
+ .state = {
+ .h = {
+ SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
+ SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
+ },
+ },
+ .bytecount = 0,
+ },
+};
+
+#define sha256_iv (initial_sha256_ctx.ctx.state)
+
+static const u32 sha256_K[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
+ 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
+ 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
+ 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
+ 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
+ 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
+};
+
+#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define Maj(x, y, z) (((x) & (y)) | ((z) & ((x) | (y))))
+#define e0(x) (ror32((x), 2) ^ ror32((x), 13) ^ ror32((x), 22))
+#define e1(x) (ror32((x), 6) ^ ror32((x), 11) ^ ror32((x), 25))
+#define s0(x) (ror32((x), 7) ^ ror32((x), 18) ^ ((x) >> 3))
+#define s1(x) (ror32((x), 17) ^ ror32((x), 19) ^ ((x) >> 10))
+
+static inline void LOAD_OP(int I, u32 *W, const u8 *input)
+{
+ W[I] = get_unaligned_be32((__u32 *)input + I);
+}
+
+static inline void BLEND_OP(int I, u32 *W)
+{
+ W[I] = s1(W[I - 2]) + W[I - 7] + s0(W[I - 15]) + W[I - 16];
+}
+
+#define SHA256_ROUND(i, a, b, c, d, e, f, g, h) \
+ do { \
+ u32 t1, t2; \
+ t1 = h + e1(e) + Ch(e, f, g) + sha256_K[i] + W[i]; \
+ t2 = e0(a) + Maj(a, b, c); \
+ d += t1; \
+ h = t1 + t2; \
+ } while (0)
+
+static void sha256_block_generic(struct sha256_block_state *state,
+ const u8 *input, u32 W[64])
+{
+ u32 a, b, c, d, e, f, g, h;
+ int i;
+
+ /* load the input */
+ for (i = 0; i < 16; i += 8) {
+ LOAD_OP(i + 0, W, input);
+ LOAD_OP(i + 1, W, input);
+ LOAD_OP(i + 2, W, input);
+ LOAD_OP(i + 3, W, input);
+ LOAD_OP(i + 4, W, input);
+ LOAD_OP(i + 5, W, input);
+ LOAD_OP(i + 6, W, input);
+ LOAD_OP(i + 7, W, input);
+ }
+
+ /* now blend */
+ for (i = 16; i < 64; i += 8) {
+ BLEND_OP(i + 0, W);
+ BLEND_OP(i + 1, W);
+ BLEND_OP(i + 2, W);
+ BLEND_OP(i + 3, W);
+ BLEND_OP(i + 4, W);
+ BLEND_OP(i + 5, W);
+ BLEND_OP(i + 6, W);
+ BLEND_OP(i + 7, W);
+ }
+
+ /* load the state into our registers */
+ a = state->h[0];
+ b = state->h[1];
+ c = state->h[2];
+ d = state->h[3];
+ e = state->h[4];
+ f = state->h[5];
+ g = state->h[6];
+ h = state->h[7];
+
+ /* now iterate */
+ for (i = 0; i < 64; i += 8) {
+ SHA256_ROUND(i + 0, a, b, c, d, e, f, g, h);
+ SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g);
+ SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f);
+ SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e);
+ SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d);
+ SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c);
+ SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b);
+ SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a);
+ }
+
+ state->h[0] += a;
+ state->h[1] += b;
+ state->h[2] += c;
+ state->h[3] += d;
+ state->h[4] += e;
+ state->h[5] += f;
+ state->h[6] += g;
+ state->h[7] += h;
+}
+
+static void __maybe_unused
+sha256_blocks_generic(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ u32 W[64];
+
+ do {
+ sha256_block_generic(state, data, W);
+ data += SHA256_BLOCK_SIZE;
+ } while (--nblocks);
+
+ memzero_explicit(W, sizeof(W));
+}
+
+#if defined(CONFIG_CRYPTO_LIB_SHA256_ARCH) && !defined(__DISABLE_EXPORTS)
+#include "sha256.h" /* $(SRCARCH)/sha256.h */
+#else
+#define sha256_blocks sha256_blocks_generic
#endif
-static inline bool sha256_purgatory(void)
+static void __sha256_init(struct __sha256_ctx *ctx,
+ const struct sha256_block_state *iv,
+ u64 initial_bytecount)
+{
+ ctx->state = *iv;
+ ctx->bytecount = initial_bytecount;
+}
+
+void sha224_init(struct sha224_ctx *ctx)
{
- return __is_defined(__DISABLE_EXPORTS);
+ __sha256_init(&ctx->ctx, &sha224_iv, 0);
}
+EXPORT_SYMBOL_GPL(sha224_init);
-static inline void sha256_blocks(u32 state[SHA256_STATE_WORDS], const u8 *data,
- size_t nblocks)
+void sha256_init(struct sha256_ctx *ctx)
{
- sha256_choose_blocks(state, data, nblocks, sha256_purgatory(), false);
+ __sha256_init(&ctx->ctx, &sha256_iv, 0);
}
+EXPORT_SYMBOL_GPL(sha256_init);
-void sha256_update(struct sha256_state *sctx, const u8 *data, size_t len)
+void __sha256_update(struct __sha256_ctx *ctx, const u8 *data, size_t len)
{
- size_t partial = sctx->count % SHA256_BLOCK_SIZE;
+ size_t partial = ctx->bytecount % SHA256_BLOCK_SIZE;
- sctx->count += len;
- BLOCK_HASH_UPDATE_BLOCKS(sha256_blocks, sctx->ctx.state, data, len,
- SHA256_BLOCK_SIZE, sctx->buf, partial);
+ ctx->bytecount += len;
+
+ if (partial + len >= SHA256_BLOCK_SIZE) {
+ size_t nblocks;
+
+ if (partial) {
+ size_t l = SHA256_BLOCK_SIZE - partial;
+
+ memcpy(&ctx->buf[partial], data, l);
+ data += l;
+ len -= l;
+
+ sha256_blocks(&ctx->state, ctx->buf, 1);
+ }
+
+ nblocks = len / SHA256_BLOCK_SIZE;
+ len %= SHA256_BLOCK_SIZE;
+
+ if (nblocks) {
+ sha256_blocks(&ctx->state, data, nblocks);
+ data += nblocks * SHA256_BLOCK_SIZE;
+ }
+ partial = 0;
+ }
+ if (len)
+ memcpy(&ctx->buf[partial], data, len);
}
-EXPORT_SYMBOL(sha256_update);
+EXPORT_SYMBOL(__sha256_update);
-static inline void __sha256_final(struct sha256_state *sctx, u8 *out,
- size_t digest_size)
+static void __sha256_final(struct __sha256_ctx *ctx,
+ u8 *out, size_t digest_size)
{
- size_t partial = sctx->count % SHA256_BLOCK_SIZE;
+ u64 bitcount = ctx->bytecount << 3;
+ size_t partial = ctx->bytecount % SHA256_BLOCK_SIZE;
+
+ ctx->buf[partial++] = 0x80;
+ if (partial > SHA256_BLOCK_SIZE - 8) {
+ memset(&ctx->buf[partial], 0, SHA256_BLOCK_SIZE - partial);
+ sha256_blocks(&ctx->state, ctx->buf, 1);
+ partial = 0;
+ }
+ memset(&ctx->buf[partial], 0, SHA256_BLOCK_SIZE - 8 - partial);
+ *(__be64 *)&ctx->buf[SHA256_BLOCK_SIZE - 8] = cpu_to_be64(bitcount);
+ sha256_blocks(&ctx->state, ctx->buf, 1);
+
+ for (size_t i = 0; i < digest_size; i += 4)
+ put_unaligned_be32(ctx->state.h[i / 4], out + i);
+}
- sha256_finup(&sctx->ctx, sctx->buf, partial, out, digest_size,
- sha256_purgatory(), false);
- memzero_explicit(sctx, sizeof(*sctx));
+void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE])
+{
+ __sha256_final(&ctx->ctx, out, SHA224_DIGEST_SIZE);
+ memzero_explicit(ctx, sizeof(*ctx));
}
+EXPORT_SYMBOL(sha224_final);
-void sha256_final(struct sha256_state *sctx, u8 out[SHA256_DIGEST_SIZE])
+void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE])
{
- __sha256_final(sctx, out, SHA256_DIGEST_SIZE);
+ __sha256_final(&ctx->ctx, out, SHA256_DIGEST_SIZE);
+ memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL(sha256_final);
-void sha224_final(struct sha256_state *sctx, u8 out[SHA224_DIGEST_SIZE])
+void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE])
{
- __sha256_final(sctx, out, SHA224_DIGEST_SIZE);
+ struct sha224_ctx ctx;
+
+ sha224_init(&ctx);
+ sha224_update(&ctx, data, len);
+ sha224_final(&ctx, out);
}
-EXPORT_SYMBOL(sha224_final);
+EXPORT_SYMBOL(sha224);
void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE])
{
- struct sha256_state sctx;
+ struct sha256_ctx ctx;
- sha256_init(&sctx);
- sha256_update(&sctx, data, len);
- sha256_final(&sctx, out);
+ sha256_init(&ctx);
+ sha256_update(&ctx, data, len);
+ sha256_final(&ctx, out);
}
EXPORT_SYMBOL(sha256);
-MODULE_DESCRIPTION("SHA-256 Algorithm");
+/*
+ * Pre-boot environments (as indicated by __DISABLE_EXPORTS being defined) just
+ * need the generic SHA-256 code. Omit all other features from them.
+ */
+#ifndef __DISABLE_EXPORTS
+
+#ifndef sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, size_t len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ return false;
+}
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+ return false;
+}
+#endif
+
+/* Sequential fallback implementation of sha256_finup_2x() */
+static noinline_for_stack void sha256_finup_2x_sequential(
+ const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2,
+ size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE])
+{
+ struct __sha256_ctx mut_ctx;
+
+ mut_ctx = *ctx;
+ __sha256_update(&mut_ctx, data1, len);
+ __sha256_final(&mut_ctx, out1, SHA256_DIGEST_SIZE);
+
+ mut_ctx = *ctx;
+ __sha256_update(&mut_ctx, data2, len);
+ __sha256_final(&mut_ctx, out2, SHA256_DIGEST_SIZE);
+}
+
+void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1,
+ const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ if (!ctx)
+ ctx = &initial_sha256_ctx;
+
+ if (likely(sha256_finup_2x_arch(&ctx->ctx, data1, data2, len, out1,
+ out2)))
+ return;
+ sha256_finup_2x_sequential(&ctx->ctx, data1, data2, len, out1, out2);
+}
+EXPORT_SYMBOL_GPL(sha256_finup_2x);
+
+bool sha256_finup_2x_is_optimized(void)
+{
+ return sha256_finup_2x_is_optimized_arch();
+}
+EXPORT_SYMBOL_GPL(sha256_finup_2x_is_optimized);
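Per the NULL-context handling above, passing ctx == NULL starts both hashes from the standard SHA-256 IV; a minimal sketch (demo_finup_2x() is hypothetical):

static void demo_finup_2x(const u8 *a, const u8 *b, size_t len)
{
	u8 d1[SHA256_DIGEST_SIZE], d2[SHA256_DIGEST_SIZE];

	/* NULL ctx: both digests start from the standard SHA-256 IV. */
	sha256_finup_2x(NULL, a, b, len, d1, d2);
}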
+
+static void __hmac_sha256_preparekey(struct sha256_block_state *istate,
+ struct sha256_block_state *ostate,
+ const u8 *raw_key, size_t raw_key_len,
+ const struct sha256_block_state *iv)
+{
+ union {
+ u8 b[SHA256_BLOCK_SIZE];
+ unsigned long w[SHA256_BLOCK_SIZE / sizeof(unsigned long)];
+ } derived_key = { 0 };
+
+ if (unlikely(raw_key_len > SHA256_BLOCK_SIZE)) {
+ if (iv == &sha224_iv)
+ sha224(raw_key, raw_key_len, derived_key.b);
+ else
+ sha256(raw_key, raw_key_len, derived_key.b);
+ } else {
+ memcpy(derived_key.b, raw_key, raw_key_len);
+ }
+
+ for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+ derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE);
+ *istate = *iv;
+ sha256_blocks(istate, derived_key.b, 1);
+
+ for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+ derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^
+ HMAC_IPAD_VALUE);
+ *ostate = *iv;
+ sha256_blocks(ostate, derived_key.b, 1);
+
+ memzero_explicit(&derived_key, sizeof(derived_key));
+}
+
+void hmac_sha224_preparekey(struct hmac_sha224_key *key,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_sha256_preparekey(&key->key.istate, &key->key.ostate,
+ raw_key, raw_key_len, &sha224_iv);
+}
+EXPORT_SYMBOL_GPL(hmac_sha224_preparekey);
+
+void hmac_sha256_preparekey(struct hmac_sha256_key *key,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_sha256_preparekey(&key->key.istate, &key->key.ostate,
+ raw_key, raw_key_len, &sha256_iv);
+}
+EXPORT_SYMBOL_GPL(hmac_sha256_preparekey);
+
+void __hmac_sha256_init(struct __hmac_sha256_ctx *ctx,
+ const struct __hmac_sha256_key *key)
+{
+ __sha256_init(&ctx->sha_ctx, &key->istate, SHA256_BLOCK_SIZE);
+ ctx->ostate = key->ostate;
+}
+EXPORT_SYMBOL_GPL(__hmac_sha256_init);
+
+void hmac_sha224_init_usingrawkey(struct hmac_sha224_ctx *ctx,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_sha256_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate,
+ raw_key, raw_key_len, &sha224_iv);
+ ctx->ctx.sha_ctx.bytecount = SHA256_BLOCK_SIZE;
+}
+EXPORT_SYMBOL_GPL(hmac_sha224_init_usingrawkey);
+
+void hmac_sha256_init_usingrawkey(struct hmac_sha256_ctx *ctx,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_sha256_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate,
+ raw_key, raw_key_len, &sha256_iv);
+ ctx->ctx.sha_ctx.bytecount = SHA256_BLOCK_SIZE;
+}
+EXPORT_SYMBOL_GPL(hmac_sha256_init_usingrawkey);
+
+static void __hmac_sha256_final(struct __hmac_sha256_ctx *ctx,
+ u8 *out, size_t digest_size)
+{
+ /* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */
+ __sha256_final(&ctx->sha_ctx, ctx->sha_ctx.buf, digest_size);
+ memset(&ctx->sha_ctx.buf[digest_size], 0,
+ SHA256_BLOCK_SIZE - digest_size);
+ ctx->sha_ctx.buf[digest_size] = 0x80;
+ *(__be32 *)&ctx->sha_ctx.buf[SHA256_BLOCK_SIZE - 4] =
+ cpu_to_be32(8 * (SHA256_BLOCK_SIZE + digest_size));
+
+ /* Compute the outer hash, which gives the HMAC value. */
+ sha256_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1);
+ for (size_t i = 0; i < digest_size; i += 4)
+ put_unaligned_be32(ctx->ostate.h[i / 4], out + i);
+
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+
+void hmac_sha224_final(struct hmac_sha224_ctx *ctx,
+ u8 out[SHA224_DIGEST_SIZE])
+{
+ __hmac_sha256_final(&ctx->ctx, out, SHA224_DIGEST_SIZE);
+}
+EXPORT_SYMBOL_GPL(hmac_sha224_final);
+
+void hmac_sha256_final(struct hmac_sha256_ctx *ctx,
+ u8 out[SHA256_DIGEST_SIZE])
+{
+ __hmac_sha256_final(&ctx->ctx, out, SHA256_DIGEST_SIZE);
+}
+EXPORT_SYMBOL_GPL(hmac_sha256_final);
+
+void hmac_sha224(const struct hmac_sha224_key *key,
+ const u8 *data, size_t data_len, u8 out[SHA224_DIGEST_SIZE])
+{
+ struct hmac_sha224_ctx ctx;
+
+ hmac_sha224_init(&ctx, key);
+ hmac_sha224_update(&ctx, data, data_len);
+ hmac_sha224_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha224);
+
+void hmac_sha256(const struct hmac_sha256_key *key,
+ const u8 *data, size_t data_len, u8 out[SHA256_DIGEST_SIZE])
+{
+ struct hmac_sha256_ctx ctx;
+
+ hmac_sha256_init(&ctx, key);
+ hmac_sha256_update(&ctx, data, data_len);
+ hmac_sha256_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha256);
+
+void hmac_sha224_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+ const u8 *data, size_t data_len,
+ u8 out[SHA224_DIGEST_SIZE])
+{
+ struct hmac_sha224_ctx ctx;
+
+ hmac_sha224_init_usingrawkey(&ctx, raw_key, raw_key_len);
+ hmac_sha224_update(&ctx, data, data_len);
+ hmac_sha224_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha224_usingrawkey);
+
+void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+ const u8 *data, size_t data_len,
+ u8 out[SHA256_DIGEST_SIZE])
+{
+ struct hmac_sha256_ctx ctx;
+
+ hmac_sha256_init_usingrawkey(&ctx, raw_key, raw_key_len);
+ hmac_sha256_update(&ctx, data, data_len);
+ hmac_sha256_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha256_usingrawkey);
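The prepared-key HMAC-SHA256 context streams like the plain hash context; a hedged sketch (demo_hmac_sha256() is hypothetical):

static void demo_hmac_sha256(const struct hmac_sha256_key *key,
			     const u8 *p1, size_t l1,
			     const u8 *p2, size_t l2)
{
	struct hmac_sha256_ctx ctx;
	u8 mac[SHA256_DIGEST_SIZE];

	hmac_sha256_init(&ctx, key);
	hmac_sha256_update(&ctx, p1, l1);
	hmac_sha256_update(&ctx, p2, l2);
	hmac_sha256_final(&ctx, mac);	/* also zeroizes ctx */
}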
+
+#if defined(sha256_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS)
+static int __init sha256_mod_init(void)
+{
+#ifdef sha256_mod_init_arch
+ sha256_mod_init_arch();
+#endif
+ if (fips_enabled) {
+ /*
+ * FIPS cryptographic algorithm self-test. As per the FIPS
+ * Implementation Guidance, testing HMAC-SHA256 satisfies the
+ * test requirement for SHA-224, SHA-256, and HMAC-SHA224 too.
+ */
+ u8 mac[SHA256_DIGEST_SIZE];
+
+ hmac_sha256_usingrawkey(fips_test_key, sizeof(fips_test_key),
+ fips_test_data, sizeof(fips_test_data),
+ mac);
+ if (memcmp(fips_test_hmac_sha256_value, mac, sizeof(mac)) != 0)
+ panic("sha256: FIPS self-test failed\n");
+ }
+ return 0;
+}
+subsys_initcall(sha256_mod_init);
+
+static void __exit sha256_mod_exit(void)
+{
+}
+module_exit(sha256_mod_exit);
+#endif
+
+#endif /* !__DISABLE_EXPORTS */
+
+MODULE_DESCRIPTION("SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions");
MODULE_LICENSE("GPL");
diff --git a/lib/crypto/sha3.c b/lib/crypto/sha3.c
new file mode 100644
index 000000000000..32b7074de792
--- /dev/null
+++ b/lib/crypto/sha3.c
@@ -0,0 +1,411 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-3, as specified in
+ * https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
+ *
+ * SHA-3 code by Jeff Garzik <jeff@garzik.org>
+ * Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ * David Howells <dhowells@redhat.com>
+ *
+ * See also Documentation/crypto/sha3.rst
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <crypto/sha3.h>
+#include <crypto/utils.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/unaligned.h>
+#include "fips.h"
+
+/*
+ * On some 32-bit architectures, such as h8300, GCC ends up using over 1 KB of
+ * stack if the round calculation gets inlined into the loop in
+ * sha3_keccakf_generic(). On the other hand, on 64-bit architectures with
+ * plenty of [64-bit wide] general purpose registers, not inlining it severely
+ * hurts performance. So let's use 64-bitness as a heuristic to decide whether
+ * to inline or not.
+ */
+#ifdef CONFIG_64BIT
+#define SHA3_INLINE inline
+#else
+#define SHA3_INLINE noinline
+#endif
+
+#define SHA3_KECCAK_ROUNDS 24
+
+static const u64 sha3_keccakf_rndc[SHA3_KECCAK_ROUNDS] = {
+ 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
+ 0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
+ 0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
+ 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
+ 0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
+ 0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
+ 0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
+ 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
+};
+
+/*
+ * Perform a single round of Keccak mixing.
+ */
+static SHA3_INLINE void sha3_keccakf_one_round_generic(u64 st[25], int round)
+{
+ u64 t[5], tt, bc[5];
+
+ /* Theta */
+ bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
+ bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
+ bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
+ bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
+ bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
+
+ t[0] = bc[4] ^ rol64(bc[1], 1);
+ t[1] = bc[0] ^ rol64(bc[2], 1);
+ t[2] = bc[1] ^ rol64(bc[3], 1);
+ t[3] = bc[2] ^ rol64(bc[4], 1);
+ t[4] = bc[3] ^ rol64(bc[0], 1);
+
+ st[0] ^= t[0];
+
+ /* Rho Pi */
+ tt = st[1];
+ st[ 1] = rol64(st[ 6] ^ t[1], 44);
+ st[ 6] = rol64(st[ 9] ^ t[4], 20);
+ st[ 9] = rol64(st[22] ^ t[2], 61);
+ st[22] = rol64(st[14] ^ t[4], 39);
+ st[14] = rol64(st[20] ^ t[0], 18);
+ st[20] = rol64(st[ 2] ^ t[2], 62);
+ st[ 2] = rol64(st[12] ^ t[2], 43);
+ st[12] = rol64(st[13] ^ t[3], 25);
+ st[13] = rol64(st[19] ^ t[4], 8);
+ st[19] = rol64(st[23] ^ t[3], 56);
+ st[23] = rol64(st[15] ^ t[0], 41);
+ st[15] = rol64(st[ 4] ^ t[4], 27);
+ st[ 4] = rol64(st[24] ^ t[4], 14);
+ st[24] = rol64(st[21] ^ t[1], 2);
+ st[21] = rol64(st[ 8] ^ t[3], 55);
+ st[ 8] = rol64(st[16] ^ t[1], 45);
+ st[16] = rol64(st[ 5] ^ t[0], 36);
+ st[ 5] = rol64(st[ 3] ^ t[3], 28);
+ st[ 3] = rol64(st[18] ^ t[3], 21);
+ st[18] = rol64(st[17] ^ t[2], 15);
+ st[17] = rol64(st[11] ^ t[1], 10);
+ st[11] = rol64(st[ 7] ^ t[2], 6);
+ st[ 7] = rol64(st[10] ^ t[0], 3);
+ st[10] = rol64( tt ^ t[1], 1);
+
+ /* Chi */
+ bc[ 0] = ~st[ 1] & st[ 2];
+ bc[ 1] = ~st[ 2] & st[ 3];
+ bc[ 2] = ~st[ 3] & st[ 4];
+ bc[ 3] = ~st[ 4] & st[ 0];
+ bc[ 4] = ~st[ 0] & st[ 1];
+ st[ 0] ^= bc[ 0];
+ st[ 1] ^= bc[ 1];
+ st[ 2] ^= bc[ 2];
+ st[ 3] ^= bc[ 3];
+ st[ 4] ^= bc[ 4];
+
+ bc[ 0] = ~st[ 6] & st[ 7];
+ bc[ 1] = ~st[ 7] & st[ 8];
+ bc[ 2] = ~st[ 8] & st[ 9];
+ bc[ 3] = ~st[ 9] & st[ 5];
+ bc[ 4] = ~st[ 5] & st[ 6];
+ st[ 5] ^= bc[ 0];
+ st[ 6] ^= bc[ 1];
+ st[ 7] ^= bc[ 2];
+ st[ 8] ^= bc[ 3];
+ st[ 9] ^= bc[ 4];
+
+ bc[ 0] = ~st[11] & st[12];
+ bc[ 1] = ~st[12] & st[13];
+ bc[ 2] = ~st[13] & st[14];
+ bc[ 3] = ~st[14] & st[10];
+ bc[ 4] = ~st[10] & st[11];
+ st[10] ^= bc[ 0];
+ st[11] ^= bc[ 1];
+ st[12] ^= bc[ 2];
+ st[13] ^= bc[ 3];
+ st[14] ^= bc[ 4];
+
+ bc[ 0] = ~st[16] & st[17];
+ bc[ 1] = ~st[17] & st[18];
+ bc[ 2] = ~st[18] & st[19];
+ bc[ 3] = ~st[19] & st[15];
+ bc[ 4] = ~st[15] & st[16];
+ st[15] ^= bc[ 0];
+ st[16] ^= bc[ 1];
+ st[17] ^= bc[ 2];
+ st[18] ^= bc[ 3];
+ st[19] ^= bc[ 4];
+
+ bc[ 0] = ~st[21] & st[22];
+ bc[ 1] = ~st[22] & st[23];
+ bc[ 2] = ~st[23] & st[24];
+ bc[ 3] = ~st[24] & st[20];
+ bc[ 4] = ~st[20] & st[21];
+ st[20] ^= bc[ 0];
+ st[21] ^= bc[ 1];
+ st[22] ^= bc[ 2];
+ st[23] ^= bc[ 3];
+ st[24] ^= bc[ 4];
+
+ /* Iota */
+ st[0] ^= sha3_keccakf_rndc[round];
+}
+
+/* Generic implementation of the Keccak-f[1600] permutation */
+static void sha3_keccakf_generic(struct sha3_state *state)
+{
+ /*
+ * Temporarily convert the state words from little-endian to native-
+ * endian so that they can be operated on. Note that on little-endian
+ * machines this conversion is a no-op and is optimized out.
+ */
+
+ for (int i = 0; i < ARRAY_SIZE(state->words); i++)
+ state->native_words[i] = le64_to_cpu(state->words[i]);
+
+ for (int round = 0; round < SHA3_KECCAK_ROUNDS; round++)
+ sha3_keccakf_one_round_generic(state->native_words, round);
+
+ for (int i = 0; i < ARRAY_SIZE(state->words); i++)
+ state->words[i] = cpu_to_le64(state->native_words[i]);
+}
+
+/*
+ * Generic implementation of absorbing the given nonzero number of full blocks
+ * into the sponge function Keccak[r=8*block_size, c=1600-8*block_size].
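+ * For example, SHA3-256's 136-byte block size gives Keccak[r=1088, c=512].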
+ */
+static void __maybe_unused
+sha3_absorb_blocks_generic(struct sha3_state *state, const u8 *data,
+ size_t nblocks, size_t block_size)
+{
+ do {
+ for (size_t i = 0; i < block_size; i += 8)
+ state->words[i / 8] ^= get_unaligned((__le64 *)&data[i]);
+ sha3_keccakf_generic(state);
+ data += block_size;
+ } while (--nblocks);
+}
+
+#ifdef CONFIG_CRYPTO_LIB_SHA3_ARCH
+#include "sha3.h" /* $(SRCARCH)/sha3.h */
+#else
+#define sha3_keccakf sha3_keccakf_generic
+#define sha3_absorb_blocks sha3_absorb_blocks_generic
+#endif
+
+void __sha3_update(struct __sha3_ctx *ctx, const u8 *in, size_t in_len)
+{
+ const size_t block_size = ctx->block_size;
+ size_t absorb_offset = ctx->absorb_offset;
+
+ /* Warn if squeezing has already begun. */
+ WARN_ON_ONCE(absorb_offset >= block_size);
+
+ if (absorb_offset && absorb_offset + in_len >= block_size) {
+ crypto_xor(&ctx->state.bytes[absorb_offset], in,
+ block_size - absorb_offset);
+ in += block_size - absorb_offset;
+ in_len -= block_size - absorb_offset;
+ sha3_keccakf(&ctx->state);
+ absorb_offset = 0;
+ }
+
+ if (in_len >= block_size) {
+ size_t nblocks = in_len / block_size;
+
+ sha3_absorb_blocks(&ctx->state, in, nblocks, block_size);
+ in += nblocks * block_size;
+ in_len -= nblocks * block_size;
+ }
+
+ if (in_len) {
+ crypto_xor(&ctx->state.bytes[absorb_offset], in, in_len);
+ absorb_offset += in_len;
+ }
+ ctx->absorb_offset = absorb_offset;
+}
+EXPORT_SYMBOL_GPL(__sha3_update);
+
+void sha3_final(struct sha3_ctx *sha3_ctx, u8 *out)
+{
+ struct __sha3_ctx *ctx = &sha3_ctx->ctx;
+
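+ /*
+ * FIPS 202: the "01" domain suffix plus the first pad10*1 bit give 0x06
+ * in the next rate byte; the final pad bit is the 0x80 xor below.
+ */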
+ ctx->state.bytes[ctx->absorb_offset] ^= 0x06;
+ ctx->state.bytes[ctx->block_size - 1] ^= 0x80;
+ sha3_keccakf(&ctx->state);
+ memcpy(out, ctx->state.bytes, ctx->digest_size);
+ sha3_zeroize_ctx(sha3_ctx);
+}
+EXPORT_SYMBOL_GPL(sha3_final);
+
+void shake_squeeze(struct shake_ctx *shake_ctx, u8 *out, size_t out_len)
+{
+ struct __sha3_ctx *ctx = &shake_ctx->ctx;
+ const size_t block_size = ctx->block_size;
+ size_t squeeze_offset = ctx->squeeze_offset;
+
+ if (ctx->absorb_offset < block_size) {
+ /* First squeeze: */
+
+ /* Add the domain separation suffix and padding. */
+ ctx->state.bytes[ctx->absorb_offset] ^= 0x1f;
+ ctx->state.bytes[block_size - 1] ^= 0x80;
+
+ /* Indicate that squeezing has begun. */
+ ctx->absorb_offset = block_size;
+
+ /*
+ * Indicate that no output is pending yet, i.e. sha3_keccakf()
+ * will need to be called before the first copy.
+ */
+ squeeze_offset = block_size;
+ }
+ while (out_len) {
+ if (squeeze_offset == block_size) {
+ sha3_keccakf(&ctx->state);
+ squeeze_offset = 0;
+ }
+ size_t copy = min(out_len, block_size - squeeze_offset);
+
+ memcpy(out, &ctx->state.bytes[squeeze_offset], copy);
+ out += copy;
+ out_len -= copy;
+ squeeze_offset += copy;
+ }
+ ctx->squeeze_offset = squeeze_offset;
+}
+EXPORT_SYMBOL_GPL(shake_squeeze);
+
+#ifndef sha3_224_arch
+static inline bool sha3_224_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_224_DIGEST_SIZE])
+{
+ return false;
+}
+#endif
+#ifndef sha3_256_arch
+static inline bool sha3_256_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_256_DIGEST_SIZE])
+{
+ return false;
+}
+#endif
+#ifndef sha3_384_arch
+static inline bool sha3_384_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_384_DIGEST_SIZE])
+{
+ return false;
+}
+#endif
+#ifndef sha3_512_arch
+static inline bool sha3_512_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_512_DIGEST_SIZE])
+{
+ return false;
+}
+#endif
+
+void sha3_224(const u8 *in, size_t in_len, u8 out[SHA3_224_DIGEST_SIZE])
+{
+ struct sha3_ctx ctx;
+
+ if (sha3_224_arch(in, in_len, out))
+ return;
+ sha3_224_init(&ctx);
+ sha3_update(&ctx, in, in_len);
+ sha3_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha3_224);
+
+void sha3_256(const u8 *in, size_t in_len, u8 out[SHA3_256_DIGEST_SIZE])
+{
+ struct sha3_ctx ctx;
+
+ if (sha3_256_arch(in, in_len, out))
+ return;
+ sha3_256_init(&ctx);
+ sha3_update(&ctx, in, in_len);
+ sha3_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha3_256);
+
+void sha3_384(const u8 *in, size_t in_len, u8 out[SHA3_384_DIGEST_SIZE])
+{
+ struct sha3_ctx ctx;
+
+ if (sha3_384_arch(in, in_len, out))
+ return;
+ sha3_384_init(&ctx);
+ sha3_update(&ctx, in, in_len);
+ sha3_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha3_384);
+
+void sha3_512(const u8 *in, size_t in_len, u8 out[SHA3_512_DIGEST_SIZE])
+{
+ struct sha3_ctx ctx;
+
+ if (sha3_512_arch(in, in_len, out))
+ return;
+ sha3_512_init(&ctx);
+ sha3_update(&ctx, in, in_len);
+ sha3_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha3_512);
+
+void shake128(const u8 *in, size_t in_len, u8 *out, size_t out_len)
+{
+ struct shake_ctx ctx;
+
+ shake128_init(&ctx);
+ shake_update(&ctx, in, in_len);
+ shake_squeeze(&ctx, out, out_len);
+ shake_zeroize_ctx(&ctx);
+}
+EXPORT_SYMBOL_GPL(shake128);
+
+void shake256(const u8 *in, size_t in_len, u8 *out, size_t out_len)
+{
+ struct shake_ctx ctx;
+
+ shake256_init(&ctx);
+ shake_update(&ctx, in, in_len);
+ shake_squeeze(&ctx, out, out_len);
+ shake_zeroize_ctx(&ctx);
+}
+EXPORT_SYMBOL_GPL(shake256);
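Since shake_squeeze() keeps its offset in the context, callers may draw the output in pieces; a hedged sketch (demo_shake128() is hypothetical):

static void demo_shake128(const u8 *in, size_t in_len,
			  u8 *out, size_t out_len)
{
	struct shake_ctx ctx;

	shake128_init(&ctx);
	shake_update(&ctx, in, in_len);
	/* XOF output can be squeezed in arbitrary-sized pieces. */
	shake_squeeze(&ctx, out, out_len / 2);
	shake_squeeze(&ctx, out + out_len / 2, out_len - out_len / 2);
	shake_zeroize_ctx(&ctx);
}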
+
+#if defined(sha3_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS)
+static int __init sha3_mod_init(void)
+{
+#ifdef sha3_mod_init_arch
+ sha3_mod_init_arch();
+#endif
+ if (fips_enabled) {
+ /*
+ * FIPS cryptographic algorithm self-test. As per the FIPS
+ * Implementation Guidance, testing any SHA-3 algorithm
+ * satisfies the test requirement for all of them.
+ */
+ u8 hash[SHA3_256_DIGEST_SIZE];
+
+ sha3_256(fips_test_data, sizeof(fips_test_data), hash);
+ if (memcmp(fips_test_sha3_256_value, hash, sizeof(hash)) != 0)
+ panic("sha3: FIPS self-test failed\n");
+ }
+ return 0;
+}
+subsys_initcall(sha3_mod_init);
+
+static void __exit sha3_mod_exit(void)
+{
+}
+module_exit(sha3_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("SHA-3 library functions");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/sha512.c b/lib/crypto/sha512.c
new file mode 100644
index 000000000000..605eab51aabd
--- /dev/null
+++ b/lib/crypto/sha512.c
@@ -0,0 +1,440 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2003 Kyle McMartin <kyle@debian.org>
+ * Copyright 2025 Google LLC
+ */
+
+#include <crypto/hmac.h>
+#include <crypto/sha2.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/overflow.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+#include <linux/wordpart.h>
+#include "fips.h"
+
+static const struct sha512_block_state sha384_iv = {
+ .h = {
+ SHA384_H0, SHA384_H1, SHA384_H2, SHA384_H3,
+ SHA384_H4, SHA384_H5, SHA384_H6, SHA384_H7,
+ },
+};
+
+static const struct sha512_block_state sha512_iv = {
+ .h = {
+ SHA512_H0, SHA512_H1, SHA512_H2, SHA512_H3,
+ SHA512_H4, SHA512_H5, SHA512_H6, SHA512_H7,
+ },
+};
+
+static const u64 sha512_K[80] = {
+ 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL,
+ 0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+ 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL,
+ 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+ 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL,
+ 0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+ 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL,
+ 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+ 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL,
+ 0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+ 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL,
+ 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+ 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL,
+ 0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+ 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL,
+ 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+ 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL,
+ 0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+ 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL,
+ 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+ 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL,
+ 0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+ 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL,
+ 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+ 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL,
+ 0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+ 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
+};
+
+#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define Maj(x, y, z) (((x) & (y)) | ((z) & ((x) | (y))))
+#define e0(x) (ror64((x), 28) ^ ror64((x), 34) ^ ror64((x), 39))
+#define e1(x) (ror64((x), 14) ^ ror64((x), 18) ^ ror64((x), 41))
+#define s0(x) (ror64((x), 1) ^ ror64((x), 8) ^ ((x) >> 7))
+#define s1(x) (ror64((x), 19) ^ ror64((x), 61) ^ ((x) >> 6))
+
+static void sha512_block_generic(struct sha512_block_state *state,
+ const u8 *data)
+{
+ u64 a = state->h[0];
+ u64 b = state->h[1];
+ u64 c = state->h[2];
+ u64 d = state->h[3];
+ u64 e = state->h[4];
+ u64 f = state->h[5];
+ u64 g = state->h[6];
+ u64 h = state->h[7];
+ u64 t1, t2;
+ u64 W[16];
+
+ for (int j = 0; j < 16; j++)
+ W[j] = get_unaligned_be64(data + j * sizeof(u64));
+
+ for (int i = 0; i < 80; i += 8) {
+ if ((i & 15) == 0 && i != 0) {
+ for (int j = 0; j < 16; j++) {
+ W[j & 15] += s1(W[(j - 2) & 15]) +
+ W[(j - 7) & 15] +
+ s0(W[(j - 15) & 15]);
+ }
+ }
+ t1 = h + e1(e) + Ch(e, f, g) + sha512_K[i] + W[(i & 15)];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2;
+ t1 = g + e1(d) + Ch(d, e, f) + sha512_K[i+1] + W[(i & 15) + 1];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2;
+ t1 = f + e1(c) + Ch(c, d, e) + sha512_K[i+2] + W[(i & 15) + 2];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2;
+ t1 = e + e1(b) + Ch(b, c, d) + sha512_K[i+3] + W[(i & 15) + 3];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2;
+ t1 = d + e1(a) + Ch(a, b, c) + sha512_K[i+4] + W[(i & 15) + 4];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2;
+ t1 = c + e1(h) + Ch(h, a, b) + sha512_K[i+5] + W[(i & 15) + 5];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2;
+ t1 = b + e1(g) + Ch(g, h, a) + sha512_K[i+6] + W[(i & 15) + 6];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2;
+ t1 = a + e1(f) + Ch(f, g, h) + sha512_K[i+7] + W[(i & 15) + 7];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2;
+ }
+
+ state->h[0] += a;
+ state->h[1] += b;
+ state->h[2] += c;
+ state->h[3] += d;
+ state->h[4] += e;
+ state->h[5] += f;
+ state->h[6] += g;
+ state->h[7] += h;
+}
+
+static void __maybe_unused
+sha512_blocks_generic(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ do {
+ sha512_block_generic(state, data);
+ data += SHA512_BLOCK_SIZE;
+ } while (--nblocks);
+}
+
+#ifdef CONFIG_CRYPTO_LIB_SHA512_ARCH
+#include "sha512.h" /* $(SRCARCH)/sha512.h */
+#else
+#define sha512_blocks sha512_blocks_generic
+#endif
+
+static void __sha512_init(struct __sha512_ctx *ctx,
+ const struct sha512_block_state *iv,
+ u64 initial_bytecount)
+{
+ ctx->state = *iv;
+ ctx->bytecount_lo = initial_bytecount;
+ ctx->bytecount_hi = 0;
+}
+
+void sha384_init(struct sha384_ctx *ctx)
+{
+ __sha512_init(&ctx->ctx, &sha384_iv, 0);
+}
+EXPORT_SYMBOL_GPL(sha384_init);
+
+void sha512_init(struct sha512_ctx *ctx)
+{
+ __sha512_init(&ctx->ctx, &sha512_iv, 0);
+}
+EXPORT_SYMBOL_GPL(sha512_init);
+
+void __sha512_update(struct __sha512_ctx *ctx, const u8 *data, size_t len)
+{
+ size_t partial = ctx->bytecount_lo % SHA512_BLOCK_SIZE;
+
+ if (check_add_overflow(ctx->bytecount_lo, len, &ctx->bytecount_lo))
+ ctx->bytecount_hi++;
+
+ if (partial + len >= SHA512_BLOCK_SIZE) {
+ size_t nblocks;
+
+ if (partial) {
+ size_t l = SHA512_BLOCK_SIZE - partial;
+
+ memcpy(&ctx->buf[partial], data, l);
+ data += l;
+ len -= l;
+
+ sha512_blocks(&ctx->state, ctx->buf, 1);
+ }
+
+ nblocks = len / SHA512_BLOCK_SIZE;
+ len %= SHA512_BLOCK_SIZE;
+
+ if (nblocks) {
+ sha512_blocks(&ctx->state, data, nblocks);
+ data += nblocks * SHA512_BLOCK_SIZE;
+ }
+ partial = 0;
+ }
+ if (len)
+ memcpy(&ctx->buf[partial], data, len);
+}
+EXPORT_SYMBOL_GPL(__sha512_update);
+
+static void __sha512_final(struct __sha512_ctx *ctx,
+ u8 *out, size_t digest_size)
+{
+ u64 bitcount_hi = (ctx->bytecount_hi << 3) | (ctx->bytecount_lo >> 61);
+ u64 bitcount_lo = ctx->bytecount_lo << 3;
+ size_t partial = ctx->bytecount_lo % SHA512_BLOCK_SIZE;
+
+ ctx->buf[partial++] = 0x80;
+ if (partial > SHA512_BLOCK_SIZE - 16) {
+ memset(&ctx->buf[partial], 0, SHA512_BLOCK_SIZE - partial);
+ sha512_blocks(&ctx->state, ctx->buf, 1);
+ partial = 0;
+ }
+ memset(&ctx->buf[partial], 0, SHA512_BLOCK_SIZE - 16 - partial);
+ *(__be64 *)&ctx->buf[SHA512_BLOCK_SIZE - 16] = cpu_to_be64(bitcount_hi);
+ *(__be64 *)&ctx->buf[SHA512_BLOCK_SIZE - 8] = cpu_to_be64(bitcount_lo);
+ sha512_blocks(&ctx->state, ctx->buf, 1);
+
+ for (size_t i = 0; i < digest_size; i += 8)
+ put_unaligned_be64(ctx->state.h[i / 8], out + i);
+}
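The two shifts above splice the 128-bit big-endian bit count from the two 64-bit byte counters; a quick check with hypothetical values:

	/*
	 * bytecount_hi = 0x1, bytecount_lo = 0x2000000000000003:
	 *   bitcount_hi = (0x1 << 3) | (bytecount_lo >> 61) = 0x9
	 *   bitcount_lo = bytecount_lo << 3 = 0x18
	 * i.e. 9 * 2^64 + 24 bits in total, matching 8x the byte count.
	 */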
+
+void sha384_final(struct sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE])
+{
+ __sha512_final(&ctx->ctx, out, SHA384_DIGEST_SIZE);
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(sha384_final);
+
+void sha512_final(struct sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE])
+{
+ __sha512_final(&ctx->ctx, out, SHA512_DIGEST_SIZE);
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(sha512_final);
+
+void sha384(const u8 *data, size_t len, u8 out[SHA384_DIGEST_SIZE])
+{
+ struct sha384_ctx ctx;
+
+ sha384_init(&ctx);
+ sha384_update(&ctx, data, len);
+ sha384_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha384);
+
+void sha512(const u8 *data, size_t len, u8 out[SHA512_DIGEST_SIZE])
+{
+ struct sha512_ctx ctx;
+
+ sha512_init(&ctx);
+ sha512_update(&ctx, data, len);
+ sha512_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha512);
+
+static void __hmac_sha512_preparekey(struct sha512_block_state *istate,
+ struct sha512_block_state *ostate,
+ const u8 *raw_key, size_t raw_key_len,
+ const struct sha512_block_state *iv)
+{
+ union {
+ u8 b[SHA512_BLOCK_SIZE];
+ unsigned long w[SHA512_BLOCK_SIZE / sizeof(unsigned long)];
+ } derived_key = { 0 };
+
+ if (unlikely(raw_key_len > SHA512_BLOCK_SIZE)) {
+ if (iv == &sha384_iv)
+ sha384(raw_key, raw_key_len, derived_key.b);
+ else
+ sha512(raw_key, raw_key_len, derived_key.b);
+ } else {
+ memcpy(derived_key.b, raw_key, raw_key_len);
+ }
+
+ for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+ derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE);
+ *istate = *iv;
+ sha512_blocks(istate, derived_key.b, 1);
+
+ for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+ derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^
+ HMAC_IPAD_VALUE);
+ *ostate = *iv;
+ sha512_blocks(ostate, derived_key.b, 1);
+
+ memzero_explicit(&derived_key, sizeof(derived_key));
+}
+
+void hmac_sha384_preparekey(struct hmac_sha384_key *key,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_sha512_preparekey(&key->key.istate, &key->key.ostate,
+ raw_key, raw_key_len, &sha384_iv);
+}
+EXPORT_SYMBOL_GPL(hmac_sha384_preparekey);
+
+void hmac_sha512_preparekey(struct hmac_sha512_key *key,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_sha512_preparekey(&key->key.istate, &key->key.ostate,
+ raw_key, raw_key_len, &sha512_iv);
+}
+EXPORT_SYMBOL_GPL(hmac_sha512_preparekey);
+
+void __hmac_sha512_init(struct __hmac_sha512_ctx *ctx,
+ const struct __hmac_sha512_key *key)
+{
+ __sha512_init(&ctx->sha_ctx, &key->istate, SHA512_BLOCK_SIZE);
+ ctx->ostate = key->ostate;
+}
+EXPORT_SYMBOL_GPL(__hmac_sha512_init);
+
+void hmac_sha384_init_usingrawkey(struct hmac_sha384_ctx *ctx,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_sha512_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate,
+ raw_key, raw_key_len, &sha384_iv);
+ ctx->ctx.sha_ctx.bytecount_lo = SHA512_BLOCK_SIZE;
+ ctx->ctx.sha_ctx.bytecount_hi = 0;
+}
+EXPORT_SYMBOL_GPL(hmac_sha384_init_usingrawkey);
+
+void hmac_sha512_init_usingrawkey(struct hmac_sha512_ctx *ctx,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_sha512_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate,
+ raw_key, raw_key_len, &sha512_iv);
+ ctx->ctx.sha_ctx.bytecount_lo = SHA512_BLOCK_SIZE;
+ ctx->ctx.sha_ctx.bytecount_hi = 0;
+}
+EXPORT_SYMBOL_GPL(hmac_sha512_init_usingrawkey);
+
+static void __hmac_sha512_final(struct __hmac_sha512_ctx *ctx,
+ u8 *out, size_t digest_size)
+{
+ /* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */
+ __sha512_final(&ctx->sha_ctx, ctx->sha_ctx.buf, digest_size);
+ memset(&ctx->sha_ctx.buf[digest_size], 0,
+ SHA512_BLOCK_SIZE - digest_size);
+ ctx->sha_ctx.buf[digest_size] = 0x80;
+ *(__be32 *)&ctx->sha_ctx.buf[SHA512_BLOCK_SIZE - 4] =
+ cpu_to_be32(8 * (SHA512_BLOCK_SIZE + digest_size));
+
+ /* Compute the outer hash, which gives the HMAC value. */
+ sha512_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1);
+ for (size_t i = 0; i < digest_size; i += 8)
+ put_unaligned_be64(ctx->ostate.h[i / 8], out + i);
+
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+
+void hmac_sha384_final(struct hmac_sha384_ctx *ctx,
+ u8 out[SHA384_DIGEST_SIZE])
+{
+ __hmac_sha512_final(&ctx->ctx, out, SHA384_DIGEST_SIZE);
+}
+EXPORT_SYMBOL_GPL(hmac_sha384_final);
+
+void hmac_sha512_final(struct hmac_sha512_ctx *ctx,
+ u8 out[SHA512_DIGEST_SIZE])
+{
+ __hmac_sha512_final(&ctx->ctx, out, SHA512_DIGEST_SIZE);
+}
+EXPORT_SYMBOL_GPL(hmac_sha512_final);
+
+void hmac_sha384(const struct hmac_sha384_key *key,
+ const u8 *data, size_t data_len, u8 out[SHA384_DIGEST_SIZE])
+{
+ struct hmac_sha384_ctx ctx;
+
+ hmac_sha384_init(&ctx, key);
+ hmac_sha384_update(&ctx, data, data_len);
+ hmac_sha384_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha384);
+
+void hmac_sha512(const struct hmac_sha512_key *key,
+ const u8 *data, size_t data_len, u8 out[SHA512_DIGEST_SIZE])
+{
+ struct hmac_sha512_ctx ctx;
+
+ hmac_sha512_init(&ctx, key);
+ hmac_sha512_update(&ctx, data, data_len);
+ hmac_sha512_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha512);
+
+void hmac_sha384_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+ const u8 *data, size_t data_len,
+ u8 out[SHA384_DIGEST_SIZE])
+{
+ struct hmac_sha384_ctx ctx;
+
+ hmac_sha384_init_usingrawkey(&ctx, raw_key, raw_key_len);
+ hmac_sha384_update(&ctx, data, data_len);
+ hmac_sha384_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha384_usingrawkey);
+
+void hmac_sha512_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+ const u8 *data, size_t data_len,
+ u8 out[SHA512_DIGEST_SIZE])
+{
+ struct hmac_sha512_ctx ctx;
+
+ hmac_sha512_init_usingrawkey(&ctx, raw_key, raw_key_len);
+ hmac_sha512_update(&ctx, data, data_len);
+ hmac_sha512_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha512_usingrawkey);
+
+#if defined(sha512_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS)
+static int __init sha512_mod_init(void)
+{
+#ifdef sha512_mod_init_arch
+ sha512_mod_init_arch();
+#endif
+ if (fips_enabled) {
+ /*
+ * FIPS cryptographic algorithm self-test. As per the FIPS
+ * Implementation Guidance, testing HMAC-SHA512 satisfies the
+ * test requirement for SHA-384, SHA-512, and HMAC-SHA384 too.
+ */
+ u8 mac[SHA512_DIGEST_SIZE];
+
+ hmac_sha512_usingrawkey(fips_test_key, sizeof(fips_test_key),
+ fips_test_data, sizeof(fips_test_data),
+ mac);
+ if (memcmp(fips_test_hmac_sha512_value, mac, sizeof(mac)) != 0)
+ panic("sha512: FIPS self-test failed\n");
+ }
+ return 0;
+}
+subsys_initcall(sha512_mod_init);
+
+static void __exit sha512_mod_exit(void)
+{
+}
+module_exit(sha512_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/sm3.c b/lib/crypto/sm3.c
index efff0e267d84..c6b9ad8a3ac6 100644
--- a/lib/crypto/sm3.c
+++ b/lib/crypto/sm3.c
@@ -9,6 +9,7 @@
*/
#include <crypto/sm3.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>
diff --git a/lib/crypto/sparc/md5.h b/lib/crypto/sparc/md5.h
new file mode 100644
index 000000000000..3995f3e075eb
--- /dev/null
+++ b/lib/crypto/sparc/md5.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * MD5 accelerated using the sparc64 crypto opcodes
+ *
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ * Copyright (c) Mathias Krause <minipli@googlemail.com>
+ * Copyright (c) Cryptoapi developers.
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ */
+
+#include <asm/elf.h>
+#include <asm/opcodes.h>
+#include <asm/pstate.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_md5_opcodes);
+
+asmlinkage void md5_sparc64_transform(struct md5_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void md5_blocks(struct md5_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_md5_opcodes)) {
+ cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
+ md5_sparc64_transform(state, data, nblocks);
+ le32_to_cpu_array(state->h, ARRAY_SIZE(state->h));
+ } else {
+ md5_blocks_generic(state, data, nblocks);
+ }
+}
+
+#define md5_mod_init_arch md5_mod_init_arch
+static void md5_mod_init_arch(void)
+{
+ unsigned long cfr;
+
+ if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+ return;
+
+ __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+ if (!(cfr & CFR_MD5))
+ return;
+
+ static_branch_enable(&have_md5_opcodes);
+ pr_info("Using sparc64 md5 opcode optimized MD5 implementation\n");
+}
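One subtlety in md5_blocks() above: MD5 is specified in little-endian words while sparc64 is big-endian, so the wrapper byte-swaps the state into the layout the MD5 opcode expects and swaps it back afterwards; the SHA helpers below need no such conversion. A minimal model of that round-trip, assuming the generic code keeps the four state words in native byte order:

	/* Hypothetical illustration, not kernel code. */
	static void md5_state_swap(u32 h[4])
	{
		for (int i = 0; i < 4; i++)
			h[i] = swab32(h[i]);	/* cpu_to_le32() == swab32() on big-endian */
	}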
diff --git a/lib/crypto/sparc/md5_asm.S b/lib/crypto/sparc/md5_asm.S
new file mode 100644
index 000000000000..60b544e4d205
--- /dev/null
+++ b/lib/crypto/sparc/md5_asm.S
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/opcodes.h>
+#include <asm/visasm.h>
+
+ENTRY(md5_sparc64_transform)
+ /* %o0 = state, %o1 = data, %o2 = nblocks */
+ VISEntryHalf
+ ld [%o0 + 0x00], %f0
+ ld [%o0 + 0x04], %f1
+ andcc %o1, 0x7, %g0
+ ld [%o0 + 0x08], %f2
+ bne,pn %xcc, 10f
+ ld [%o0 + 0x0c], %f3
+
+1:
+ ldd [%o1 + 0x00], %f8
+ ldd [%o1 + 0x08], %f10
+ ldd [%o1 + 0x10], %f12
+ ldd [%o1 + 0x18], %f14
+ ldd [%o1 + 0x20], %f16
+ ldd [%o1 + 0x28], %f18
+ ldd [%o1 + 0x30], %f20
+ ldd [%o1 + 0x38], %f22
+
+ MD5
+
+ subcc %o2, 1, %o2
+ bne,pt %xcc, 1b
+ add %o1, 0x40, %o1
+
+5:
+ st %f0, [%o0 + 0x00]
+ st %f1, [%o0 + 0x04]
+ st %f2, [%o0 + 0x08]
+ st %f3, [%o0 + 0x0c]
+ retl
+ VISExitHalf
+10:
+ alignaddr %o1, %g0, %o1
+
+ ldd [%o1 + 0x00], %f10
+1:
+ ldd [%o1 + 0x08], %f12
+ ldd [%o1 + 0x10], %f14
+ ldd [%o1 + 0x18], %f16
+ ldd [%o1 + 0x20], %f18
+ ldd [%o1 + 0x28], %f20
+ ldd [%o1 + 0x30], %f22
+ ldd [%o1 + 0x38], %f24
+ ldd [%o1 + 0x40], %f26
+
+ faligndata %f10, %f12, %f8
+ faligndata %f12, %f14, %f10
+ faligndata %f14, %f16, %f12
+ faligndata %f16, %f18, %f14
+ faligndata %f18, %f20, %f16
+ faligndata %f20, %f22, %f18
+ faligndata %f22, %f24, %f20
+ faligndata %f24, %f26, %f22
+
+ MD5
+
+ subcc %o2, 1, %o2
+ fsrc2 %f26, %f10
+ bne,pt %xcc, 1b
+ add %o1, 0x40, %o1
+
+ ba,a,pt %xcc, 5b
+ENDPROC(md5_sparc64_transform)
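The 10: path above handles source buffers that are not 8-byte aligned: alignaddr rounds %o1 down to an aligned address and latches the byte offset in %gsr, and each faligndata splices two adjacent aligned double-words into one correctly aligned value (hence the extra double-word loaded per block to feed the splice). A rough C model of the same technique, as a hypothetical helper rather than kernel code:

	/* Read a big-endian u64 at an unaligned address using only aligned loads. */
	static u64 load_be64_via_aligned(const u8 *p)
	{
		unsigned long addr = (unsigned long)p;
		const u64 *base = (const u64 *)(addr & ~7UL);
		unsigned int shift = (addr & 7) * 8;

		if (!shift)
			return base[0];	/* already aligned; avoid over-reading */
		return (base[0] << shift) | (base[1] >> (64 - shift));
	}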
diff --git a/lib/crypto/sparc/sha1.h b/lib/crypto/sparc/sha1.h
new file mode 100644
index 000000000000..bdf771fcc1f7
--- /dev/null
+++ b/lib/crypto/sparc/sha1.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SHA-1 accelerated using the sparc64 crypto opcodes
+ *
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ * Copyright (c) Mathias Krause <minipli@googlemail.com>
+ */
+
+#include <asm/elf.h>
+#include <asm/opcodes.h>
+#include <asm/pstate.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha1_opcodes);
+
+asmlinkage void sha1_sparc64_transform(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha1_blocks(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_sha1_opcodes))
+ sha1_sparc64_transform(state, data, nblocks);
+ else
+ sha1_blocks_generic(state, data, nblocks);
+}
+
+#define sha1_mod_init_arch sha1_mod_init_arch
+static void sha1_mod_init_arch(void)
+{
+ unsigned long cfr;
+
+ if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+ return;
+
+ __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+ if (!(cfr & CFR_SHA1))
+ return;
+
+ static_branch_enable(&have_sha1_opcodes);
+ pr_info("Using sparc64 sha1 opcode optimized SHA-1 implementation\n");
+}
diff --git a/lib/crypto/sparc/sha1_asm.S b/lib/crypto/sparc/sha1_asm.S
new file mode 100644
index 000000000000..00b46bac1b08
--- /dev/null
+++ b/lib/crypto/sparc/sha1_asm.S
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/opcodes.h>
+#include <asm/visasm.h>
+
+ENTRY(sha1_sparc64_transform)
+ /* %o0 = state, %o1 = data, %o2 = nblocks */
+ VISEntryHalf
+ ld [%o0 + 0x00], %f0
+ ld [%o0 + 0x04], %f1
+ ld [%o0 + 0x08], %f2
+ andcc %o1, 0x7, %g0
+ ld [%o0 + 0x0c], %f3
+ bne,pn %xcc, 10f
+ ld [%o0 + 0x10], %f4
+
+1:
+ ldd [%o1 + 0x00], %f8
+ ldd [%o1 + 0x08], %f10
+ ldd [%o1 + 0x10], %f12
+ ldd [%o1 + 0x18], %f14
+ ldd [%o1 + 0x20], %f16
+ ldd [%o1 + 0x28], %f18
+ ldd [%o1 + 0x30], %f20
+ ldd [%o1 + 0x38], %f22
+
+ SHA1
+
+ subcc %o2, 1, %o2
+ bne,pt %xcc, 1b
+ add %o1, 0x40, %o1
+
+5:
+ st %f0, [%o0 + 0x00]
+ st %f1, [%o0 + 0x04]
+ st %f2, [%o0 + 0x08]
+ st %f3, [%o0 + 0x0c]
+ st %f4, [%o0 + 0x10]
+ retl
+ VISExitHalf
+10:
+ alignaddr %o1, %g0, %o1
+
+ ldd [%o1 + 0x00], %f10
+1:
+ ldd [%o1 + 0x08], %f12
+ ldd [%o1 + 0x10], %f14
+ ldd [%o1 + 0x18], %f16
+ ldd [%o1 + 0x20], %f18
+ ldd [%o1 + 0x28], %f20
+ ldd [%o1 + 0x30], %f22
+ ldd [%o1 + 0x38], %f24
+ ldd [%o1 + 0x40], %f26
+
+ faligndata %f10, %f12, %f8
+ faligndata %f12, %f14, %f10
+ faligndata %f14, %f16, %f12
+ faligndata %f16, %f18, %f14
+ faligndata %f18, %f20, %f16
+ faligndata %f20, %f22, %f18
+ faligndata %f22, %f24, %f20
+ faligndata %f24, %f26, %f22
+
+ SHA1
+
+ subcc %o2, 1, %o2
+ fsrc2 %f26, %f10
+ bne,pt %xcc, 1b
+ add %o1, 0x40, %o1
+
+ ba,a,pt %xcc, 5b
+ENDPROC(sha1_sparc64_transform)
diff --git a/lib/crypto/sparc/sha256.h b/lib/crypto/sparc/sha256.h
new file mode 100644
index 000000000000..b2f4419ec778
--- /dev/null
+++ b/lib/crypto/sparc/sha256.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SHA-256 accelerated using the sparc64 sha256 opcodes
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
+ */
+
+#include <asm/elf.h>
+#include <asm/opcodes.h>
+#include <asm/pstate.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_opcodes);
+
+asmlinkage void sha256_sparc64_transform(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha256_blocks(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_sha256_opcodes))
+ sha256_sparc64_transform(state, data, nblocks);
+ else
+ sha256_blocks_generic(state, data, nblocks);
+}
+
+#define sha256_mod_init_arch sha256_mod_init_arch
+static void sha256_mod_init_arch(void)
+{
+ unsigned long cfr;
+
+ if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+ return;
+
+ __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+ if (!(cfr & CFR_SHA256))
+ return;
+
+ static_branch_enable(&have_sha256_opcodes);
+ pr_info("Using sparc64 sha256 opcode optimized SHA-256/SHA-224 implementation\n");
+}
diff --git a/lib/crypto/sparc/sha256_asm.S b/lib/crypto/sparc/sha256_asm.S
new file mode 100644
index 000000000000..ddcdd3daf31e
--- /dev/null
+++ b/lib/crypto/sparc/sha256_asm.S
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/opcodes.h>
+#include <asm/visasm.h>
+
+ENTRY(sha256_sparc64_transform)
+ /* %o0 = state, %o1 = data, %o2 = nblocks */
+ VISEntryHalf
+ ld [%o0 + 0x00], %f0
+ ld [%o0 + 0x04], %f1
+ ld [%o0 + 0x08], %f2
+ ld [%o0 + 0x0c], %f3
+ ld [%o0 + 0x10], %f4
+ ld [%o0 + 0x14], %f5
+ andcc %o1, 0x7, %g0
+ ld [%o0 + 0x18], %f6
+ bne,pn %xcc, 10f
+ ld [%o0 + 0x1c], %f7
+
+1:
+ ldd [%o1 + 0x00], %f8
+ ldd [%o1 + 0x08], %f10
+ ldd [%o1 + 0x10], %f12
+ ldd [%o1 + 0x18], %f14
+ ldd [%o1 + 0x20], %f16
+ ldd [%o1 + 0x28], %f18
+ ldd [%o1 + 0x30], %f20
+ ldd [%o1 + 0x38], %f22
+
+ SHA256
+
+ subcc %o2, 1, %o2
+ bne,pt %xcc, 1b
+ add %o1, 0x40, %o1
+
+5:
+ st %f0, [%o0 + 0x00]
+ st %f1, [%o0 + 0x04]
+ st %f2, [%o0 + 0x08]
+ st %f3, [%o0 + 0x0c]
+ st %f4, [%o0 + 0x10]
+ st %f5, [%o0 + 0x14]
+ st %f6, [%o0 + 0x18]
+ st %f7, [%o0 + 0x1c]
+ retl
+ VISExitHalf
+10:
+ alignaddr %o1, %g0, %o1
+
+ ldd [%o1 + 0x00], %f10
+1:
+ ldd [%o1 + 0x08], %f12
+ ldd [%o1 + 0x10], %f14
+ ldd [%o1 + 0x18], %f16
+ ldd [%o1 + 0x20], %f18
+ ldd [%o1 + 0x28], %f20
+ ldd [%o1 + 0x30], %f22
+ ldd [%o1 + 0x38], %f24
+ ldd [%o1 + 0x40], %f26
+
+ faligndata %f10, %f12, %f8
+ faligndata %f12, %f14, %f10
+ faligndata %f14, %f16, %f12
+ faligndata %f16, %f18, %f14
+ faligndata %f18, %f20, %f16
+ faligndata %f20, %f22, %f18
+ faligndata %f22, %f24, %f20
+ faligndata %f24, %f26, %f22
+
+ SHA256
+
+ subcc %o2, 1, %o2
+ fsrc2 %f26, %f10
+ bne,pt %xcc, 1b
+ add %o1, 0x40, %o1
+
+ ba,a,pt %xcc, 5b
+ENDPROC(sha256_sparc64_transform)
diff --git a/lib/crypto/sparc/sha512.h b/lib/crypto/sparc/sha512.h
new file mode 100644
index 000000000000..a8c37a7d4c39
--- /dev/null
+++ b/lib/crypto/sparc/sha512.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SHA-512 accelerated using the sparc64 sha512 opcodes
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2003 Kyle McMartin <kyle@debian.org>
+ */
+
+#include <asm/elf.h>
+#include <asm/opcodes.h>
+#include <asm/pstate.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha512_opcodes);
+
+asmlinkage void sha512_sparc64_transform(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha512_blocks(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_sha512_opcodes))
+ sha512_sparc64_transform(state, data, nblocks);
+ else
+ sha512_blocks_generic(state, data, nblocks);
+}
+
+#define sha512_mod_init_arch sha512_mod_init_arch
+static void sha512_mod_init_arch(void)
+{
+ unsigned long cfr;
+
+ if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+ return;
+
+ __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+ if (!(cfr & CFR_SHA512))
+ return;
+
+ static_branch_enable(&have_sha512_opcodes);
+ pr_info("Using sparc64 sha512 opcode optimized SHA-512/SHA-384 implementation\n");
+}
diff --git a/lib/crypto/sparc/sha512_asm.S b/lib/crypto/sparc/sha512_asm.S
new file mode 100644
index 000000000000..9932b4fe1b59
--- /dev/null
+++ b/lib/crypto/sparc/sha512_asm.S
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/opcodes.h>
+#include <asm/visasm.h>
+
+ENTRY(sha512_sparc64_transform)
+ /* %o0 = state, %o1 = data, %o2 = nblocks */
+ VISEntry
+ ldd [%o0 + 0x00], %f0
+ ldd [%o0 + 0x08], %f2
+ ldd [%o0 + 0x10], %f4
+ ldd [%o0 + 0x18], %f6
+ ldd [%o0 + 0x20], %f8
+ ldd [%o0 + 0x28], %f10
+ andcc %o1, 0x7, %g0
+ ldd [%o0 + 0x30], %f12
+ bne,pn %xcc, 10f
+ ldd [%o0 + 0x38], %f14
+
+1:
+ ldd [%o1 + 0x00], %f16
+ ldd [%o1 + 0x08], %f18
+ ldd [%o1 + 0x10], %f20
+ ldd [%o1 + 0x18], %f22
+ ldd [%o1 + 0x20], %f24
+ ldd [%o1 + 0x28], %f26
+ ldd [%o1 + 0x30], %f28
+ ldd [%o1 + 0x38], %f30
+ ldd [%o1 + 0x40], %f32
+ ldd [%o1 + 0x48], %f34
+ ldd [%o1 + 0x50], %f36
+ ldd [%o1 + 0x58], %f38
+ ldd [%o1 + 0x60], %f40
+ ldd [%o1 + 0x68], %f42
+ ldd [%o1 + 0x70], %f44
+ ldd [%o1 + 0x78], %f46
+
+ SHA512
+
+ subcc %o2, 1, %o2
+ bne,pt %xcc, 1b
+ add %o1, 0x80, %o1
+
+5:
+ std %f0, [%o0 + 0x00]
+ std %f2, [%o0 + 0x08]
+ std %f4, [%o0 + 0x10]
+ std %f6, [%o0 + 0x18]
+ std %f8, [%o0 + 0x20]
+ std %f10, [%o0 + 0x28]
+ std %f12, [%o0 + 0x30]
+ std %f14, [%o0 + 0x38]
+ retl
+ VISExit
+10:
+ alignaddr %o1, %g0, %o1
+
+ ldd [%o1 + 0x00], %f18
+1:
+ ldd [%o1 + 0x08], %f20
+ ldd [%o1 + 0x10], %f22
+ ldd [%o1 + 0x18], %f24
+ ldd [%o1 + 0x20], %f26
+ ldd [%o1 + 0x28], %f28
+ ldd [%o1 + 0x30], %f30
+ ldd [%o1 + 0x38], %f32
+ ldd [%o1 + 0x40], %f34
+ ldd [%o1 + 0x48], %f36
+ ldd [%o1 + 0x50], %f38
+ ldd [%o1 + 0x58], %f40
+ ldd [%o1 + 0x60], %f42
+ ldd [%o1 + 0x68], %f44
+ ldd [%o1 + 0x70], %f46
+ ldd [%o1 + 0x78], %f48
+ ldd [%o1 + 0x80], %f50
+
+ faligndata %f18, %f20, %f16
+ faligndata %f20, %f22, %f18
+ faligndata %f22, %f24, %f20
+ faligndata %f24, %f26, %f22
+ faligndata %f26, %f28, %f24
+ faligndata %f28, %f30, %f26
+ faligndata %f30, %f32, %f28
+ faligndata %f32, %f34, %f30
+ faligndata %f34, %f36, %f32
+ faligndata %f36, %f38, %f34
+ faligndata %f38, %f40, %f36
+ faligndata %f40, %f42, %f38
+ faligndata %f42, %f44, %f40
+ faligndata %f44, %f46, %f42
+ faligndata %f46, %f48, %f44
+ faligndata %f48, %f50, %f46
+
+ SHA512
+
+ subcc %o2, 1, %o2
+ fsrc2 %f50, %f18
+ bne,pt %xcc, 1b
+ add %o1, 0x80, %o1
+
+ ba,a,pt %xcc, 5b
+ENDPROC(sha512_sparc64_transform)
diff --git a/lib/crypto/tests/Kconfig b/lib/crypto/tests/Kconfig
new file mode 100644
index 000000000000..61d435c450bb
--- /dev/null
+++ b/lib/crypto/tests/Kconfig
@@ -0,0 +1,118 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+config CRYPTO_LIB_BLAKE2B_KUNIT_TEST
+ tristate "KUnit tests for BLAKE2b" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_BLAKE2B
+ help
+ KUnit tests for the BLAKE2b cryptographic hash function.
+
+config CRYPTO_LIB_BLAKE2S_KUNIT_TEST
+ tristate "KUnit tests for BLAKE2s" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ # No need to select CRYPTO_LIB_BLAKE2S here, as that option doesn't
+ # exist; the BLAKE2s code is always built-in for the /dev/random driver.
+ help
+ KUnit tests for the BLAKE2s cryptographic hash function.
+
+config CRYPTO_LIB_CURVE25519_KUNIT_TEST
+ tristate "KUnit tests for Curve25519" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_CURVE25519
+ help
+ KUnit tests for the Curve25519 Diffie-Hellman function.
+
+config CRYPTO_LIB_MD5_KUNIT_TEST
+ tristate "KUnit tests for MD5" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_MD5
+ help
+ KUnit tests for the MD5 cryptographic hash function and its
+ corresponding HMAC.
+
+config CRYPTO_LIB_POLY1305_KUNIT_TEST
+ tristate "KUnit tests for Poly1305" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_POLY1305
+ help
+ KUnit tests for the Poly1305 library functions.
+
+config CRYPTO_LIB_POLYVAL_KUNIT_TEST
+ tristate "KUnit tests for POLYVAL" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_POLYVAL
+ help
+ KUnit tests for the POLYVAL library functions.
+
+config CRYPTO_LIB_SHA1_KUNIT_TEST
+ tristate "KUnit tests for SHA-1" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_SHA1
+ help
+ KUnit tests for the SHA-1 cryptographic hash function and its
+ corresponding HMAC.
+
+# Option is named *_SHA256_KUNIT_TEST, though both SHA-224 and SHA-256 tests are
+# included, for consistency with the naming used elsewhere (e.g. CRYPTO_SHA256).
+config CRYPTO_LIB_SHA256_KUNIT_TEST
+ tristate "KUnit tests for SHA-224 and SHA-256" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_SHA256
+ help
+ KUnit tests for the SHA-224 and SHA-256 cryptographic hash functions
+ and their corresponding HMACs.
+
+# Option is named *_SHA512_KUNIT_TEST, though both SHA-384 and SHA-512 tests are
+# included, for consistency with the naming used elsewhere (e.g. CRYPTO_SHA512).
+config CRYPTO_LIB_SHA512_KUNIT_TEST
+ tristate "KUnit tests for SHA-384 and SHA-512" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_SHA512
+ help
+ KUnit tests for the SHA-384 and SHA-512 cryptographic hash functions
+ and their corresponding HMACs.
+
+config CRYPTO_LIB_SHA3_KUNIT_TEST
+ tristate "KUnit tests for SHA-3" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_SHA3
+ help
+ KUnit tests for the SHA-3 cryptographic hash and XOF functions,
+ including SHA3-224, SHA3-256, SHA3-384, SHA3-512, SHAKE128 and
+ SHAKE256.
+
+config CRYPTO_LIB_BENCHMARK_VISIBLE
+ bool
+
+config CRYPTO_LIB_BENCHMARK
+ bool "Include benchmarks in KUnit tests for cryptographic functions"
+ depends on CRYPTO_LIB_BENCHMARK_VISIBLE
+ help
+ Include benchmarks in the KUnit tests for cryptographic functions.
+ The benchmark results are printed to the kernel log when the
+ corresponding KUnit test suite runs.
+
+ This is useful for evaluating the performance of the cryptographic
+ functions. However, it will increase the runtime of the KUnit tests.
+
+ If you're only interested in correctness testing, leave this disabled.
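As a usage note, these options are meant to be enabled through the usual KUnit workflow; a plausible .kunitconfig fragment (file name and option set assumed, not taken from this patch) for running the SHA-512 suite with benchmarks via ./tools/testing/kunit/kunit.py run would be:

	CONFIG_KUNIT=y
	CONFIG_CRYPTO_LIB_SHA512_KUNIT_TEST=y
	CONFIG_CRYPTO_LIB_BENCHMARK=y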
diff --git a/lib/crypto/tests/Makefile b/lib/crypto/tests/Makefile
new file mode 100644
index 000000000000..5109a0651925
--- /dev/null
+++ b/lib/crypto/tests/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+obj-$(CONFIG_CRYPTO_LIB_BLAKE2B_KUNIT_TEST) += blake2b_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_BLAKE2S_KUNIT_TEST) += blake2s_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_CURVE25519_KUNIT_TEST) += curve25519_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_MD5_KUNIT_TEST) += md5_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_POLY1305_KUNIT_TEST) += poly1305_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_POLYVAL_KUNIT_TEST) += polyval_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_SHA1_KUNIT_TEST) += sha1_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_SHA256_KUNIT_TEST) += sha224_kunit.o sha256_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_SHA512_KUNIT_TEST) += sha384_kunit.o sha512_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_SHA3_KUNIT_TEST) += sha3_kunit.o
diff --git a/lib/crypto/tests/blake2b-testvecs.h b/lib/crypto/tests/blake2b-testvecs.h
new file mode 100644
index 000000000000..9e407dbc219c
--- /dev/null
+++ b/lib/crypto/tests/blake2b-testvecs.h
@@ -0,0 +1,342 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py blake2b */
+
+static const struct {
+ size_t data_len;
+ u8 digest[BLAKE2B_HASH_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0x78, 0x6a, 0x02, 0xf7, 0x42, 0x01, 0x59, 0x03,
+ 0xc6, 0xc6, 0xfd, 0x85, 0x25, 0x52, 0xd2, 0x72,
+ 0x91, 0x2f, 0x47, 0x40, 0xe1, 0x58, 0x47, 0x61,
+ 0x8a, 0x86, 0xe2, 0x17, 0xf7, 0x1f, 0x54, 0x19,
+ 0xd2, 0x5e, 0x10, 0x31, 0xaf, 0xee, 0x58, 0x53,
+ 0x13, 0x89, 0x64, 0x44, 0x93, 0x4e, 0xb0, 0x4b,
+ 0x90, 0x3a, 0x68, 0x5b, 0x14, 0x48, 0xb7, 0x55,
+ 0xd5, 0x6f, 0x70, 0x1a, 0xfe, 0x9b, 0xe2, 0xce,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0x6f, 0x2e, 0xcc, 0x83, 0x53, 0xa3, 0x20, 0x16,
+ 0x5b, 0xda, 0xd0, 0x04, 0xd3, 0xcb, 0xe4, 0x37,
+ 0x5b, 0xf0, 0x84, 0x36, 0xe1, 0xad, 0x45, 0xcc,
+ 0x4d, 0x7f, 0x09, 0x68, 0xb2, 0x62, 0x93, 0x7f,
+ 0x72, 0x32, 0xe8, 0xa7, 0x2f, 0x1f, 0x6f, 0xc6,
+ 0x14, 0xd6, 0x70, 0xae, 0x0c, 0xf0, 0xf3, 0xce,
+ 0x64, 0x4d, 0x22, 0xdf, 0xc7, 0xa7, 0xf8, 0xa8,
+ 0x18, 0x23, 0xd8, 0x6c, 0xaf, 0x65, 0xa2, 0x54,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0x04, 0x13, 0xe2, 0x10, 0xbe, 0x65, 0xde, 0xce,
+ 0x61, 0xa8, 0xe0, 0xd6, 0x35, 0xb1, 0xb8, 0x88,
+ 0xd2, 0xea, 0x45, 0x3a, 0xe1, 0x8d, 0x94, 0xb5,
+ 0x66, 0x06, 0x98, 0x96, 0x39, 0xf8, 0x0e, 0xcb,
+ 0x34, 0xa6, 0xa8, 0x17, 0xfe, 0x56, 0xbc, 0xa9,
+ 0x5e, 0x1b, 0xb1, 0xde, 0x3c, 0xc7, 0x78, 0x4f,
+ 0x39, 0xc6, 0xfc, 0xa8, 0xb3, 0x27, 0x66, 0x3e,
+ 0x4e, 0xb5, 0x5d, 0x08, 0x89, 0xee, 0xd1, 0xe0,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0x2b, 0x4a, 0xa3, 0x4e, 0x2b, 0x7a, 0x47, 0x20,
+ 0x30, 0x5b, 0x09, 0x17, 0x3a, 0xf4, 0xcc, 0xf0,
+ 0xf7, 0x7b, 0x97, 0x68, 0x98, 0x9f, 0x4f, 0x09,
+ 0x46, 0x25, 0xe7, 0xd6, 0x53, 0x6b, 0xf9, 0x68,
+ 0x48, 0x12, 0x44, 0x8c, 0x9a, 0xc8, 0xd4, 0x42,
+ 0xeb, 0x2c, 0x5f, 0x41, 0xba, 0x17, 0xd0, 0xc3,
+ 0xad, 0xfd, 0xfb, 0x42, 0x33, 0xcb, 0x08, 0x5d,
+ 0xd2, 0x5c, 0x3d, 0xde, 0x87, 0x4d, 0xd6, 0xe4,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0xbf, 0x40, 0xf2, 0x38, 0x44, 0x8e, 0x24, 0x5e,
+ 0xbc, 0x67, 0xbb, 0xf0, 0x10, 0x9a, 0x79, 0xbb,
+ 0x36, 0x55, 0xce, 0xd2, 0xba, 0x04, 0x0d, 0xe8,
+ 0x30, 0x29, 0x5c, 0x2a, 0xa6, 0x3a, 0x4f, 0x37,
+ 0xac, 0x5f, 0xd4, 0x13, 0xa2, 0xf4, 0xfe, 0x80,
+ 0x61, 0xd7, 0x58, 0x66, 0x0c, 0x7f, 0xa2, 0x56,
+ 0x6b, 0x52, 0x7c, 0x22, 0x73, 0x7f, 0x17, 0xaa,
+ 0x91, 0x5a, 0x22, 0x06, 0xd9, 0x00, 0x48, 0x12,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0x41, 0x04, 0x65, 0x93, 0x81, 0x9a, 0x20, 0x0a,
+ 0x00, 0x60, 0x00, 0x64, 0x4c, 0x04, 0x3d, 0xe0,
+ 0x6b, 0x17, 0x0c, 0xe1, 0x0e, 0x28, 0x8b, 0xa0,
+ 0x76, 0xd2, 0x79, 0xb0, 0x33, 0x60, 0x61, 0x27,
+ 0xf2, 0x64, 0xf1, 0x8a, 0xe5, 0x3e, 0xaa, 0x37,
+ 0x60, 0xad, 0x2d, 0x75, 0x13, 0xae, 0xd8, 0x9e,
+ 0xec, 0xe0, 0xe4, 0x40, 0x2f, 0x59, 0x44, 0xb0,
+ 0x66, 0x7a, 0x68, 0x38, 0xce, 0x21, 0x99, 0x2a,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0x19, 0x6f, 0x9d, 0xc7, 0x87, 0x12, 0x5c, 0xa3,
+ 0xe2, 0xd3, 0xf1, 0x82, 0xec, 0xf3, 0x55, 0x9c,
+ 0x86, 0xd1, 0x6d, 0xde, 0xcf, 0x5b, 0xec, 0x4c,
+ 0x43, 0x25, 0x85, 0x90, 0xef, 0xe8, 0xe3, 0x5f,
+ 0x2c, 0x3a, 0x84, 0x07, 0xb8, 0x55, 0xfd, 0x5e,
+ 0xa4, 0x45, 0xf2, 0xac, 0xe4, 0xbd, 0xc7, 0x96,
+ 0x80, 0x59, 0x3e, 0xc9, 0xb1, 0x60, 0xb1, 0x2b,
+ 0x17, 0x49, 0x7d, 0x3e, 0x7d, 0x4d, 0x70, 0x24,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x73, 0x72, 0xd5, 0x0a, 0x97, 0xb4, 0x7d, 0xdb,
+ 0x05, 0x14, 0x8e, 0x40, 0xc2, 0x9a, 0x8a, 0x74,
+ 0x4b, 0xda, 0x7e, 0xfc, 0x97, 0x57, 0x23, 0x39,
+ 0xdc, 0x57, 0x09, 0x13, 0x24, 0xfc, 0xf3, 0x23,
+ 0x55, 0x48, 0xdd, 0xe5, 0x07, 0x9a, 0x6f, 0x7b,
+ 0x62, 0xea, 0x4d, 0x79, 0xb4, 0xb9, 0xc5, 0x86,
+ 0xc0, 0x34, 0xd6, 0xd2, 0x6c, 0xc3, 0x94, 0xfb,
+ 0x34, 0xd6, 0x62, 0xae, 0xb8, 0x99, 0xf1, 0x38,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0x42, 0x3a, 0xe3, 0xa2, 0xae, 0x5a, 0x28, 0xce,
+ 0xf1, 0x3c, 0x97, 0xc2, 0x34, 0xf6, 0xb5, 0x1e,
+ 0xfc, 0x31, 0xb4, 0x04, 0x61, 0xb7, 0x54, 0x0b,
+ 0x0d, 0x1a, 0x22, 0x9c, 0x04, 0x67, 0x5c, 0x4c,
+ 0x75, 0x1b, 0x10, 0x0b, 0x99, 0xe2, 0xb1, 0x5e,
+ 0x5d, 0x4b, 0x7a, 0xe6, 0xf6, 0xb5, 0x62, 0xee,
+ 0x2d, 0x44, 0x57, 0xb2, 0x96, 0x73, 0x5e, 0xb9,
+ 0x6a, 0xb2, 0xb3, 0x16, 0xa3, 0xd9, 0x6a, 0x60,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x50, 0xb9, 0xbe, 0xb2, 0x69, 0x07, 0x45, 0x5b,
+ 0x59, 0xde, 0x8d, 0xbf, 0x08, 0xdc, 0x2e, 0x7f,
+ 0x93, 0x29, 0xc1, 0x91, 0xe8, 0x74, 0x03, 0x89,
+ 0x20, 0xfb, 0xb2, 0x4b, 0xe8, 0x68, 0x6f, 0xe1,
+ 0xb4, 0x30, 0xbe, 0x11, 0x3c, 0x43, 0x19, 0x66,
+ 0x72, 0x78, 0xb7, 0xf4, 0xe9, 0x09, 0x18, 0x4e,
+ 0xae, 0x4a, 0x24, 0xe0, 0x6f, 0x44, 0x02, 0xe3,
+ 0xfd, 0xda, 0xb3, 0x3e, 0x3c, 0x6d, 0x54, 0x2e,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0xd6, 0xf2, 0xa9, 0x61, 0x3f, 0xce, 0x2a, 0x68,
+ 0x19, 0x86, 0xff, 0xd1, 0xee, 0x89, 0x3b, 0xa4,
+ 0x10, 0x9a, 0x91, 0x50, 0x35, 0x48, 0x9e, 0xf5,
+ 0x9c, 0x95, 0xe0, 0xfb, 0x92, 0x0f, 0xa8, 0xf7,
+ 0x6c, 0x43, 0x85, 0xf1, 0x6e, 0x11, 0x4e, 0x67,
+ 0x78, 0xd7, 0x53, 0x25, 0x0c, 0xf8, 0xce, 0x38,
+ 0x74, 0x08, 0xb0, 0x3c, 0x53, 0x20, 0x4d, 0xc4,
+ 0x9a, 0xf5, 0x78, 0xe8, 0x41, 0x8f, 0xed, 0x1f,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0xe8, 0xb2, 0xc5, 0xa7, 0xf5, 0xfa, 0xee, 0xa0,
+ 0x57, 0xba, 0x58, 0xf9, 0x0a, 0xf2, 0x64, 0x16,
+ 0xa8, 0xa6, 0x03, 0x85, 0x3b, 0xb8, 0x6f, 0xca,
+ 0x76, 0xc3, 0xa1, 0x2b, 0xec, 0xef, 0xc4, 0x66,
+ 0x11, 0xdf, 0x03, 0x85, 0x9d, 0x0c, 0x37, 0x7b,
+ 0xa9, 0x7b, 0x44, 0xfb, 0x11, 0x8f, 0x3f, 0x71,
+ 0xcd, 0x81, 0x43, 0x2e, 0x71, 0x5c, 0x54, 0x9f,
+ 0xca, 0x0f, 0x01, 0x91, 0xca, 0xaa, 0x93, 0xe9,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x05, 0x8e, 0x9d, 0xdc, 0xe9, 0x36, 0x3e, 0x73,
+ 0x63, 0x59, 0x69, 0x81, 0x0b, 0x8c, 0xc7, 0x9e,
+ 0xcc, 0xe7, 0x9c, 0x19, 0x54, 0xa7, 0x2f, 0x86,
+ 0xb5, 0xea, 0xae, 0x6d, 0xfe, 0x4e, 0x6e, 0x83,
+ 0x8d, 0x1a, 0x1c, 0x70, 0x3f, 0x34, 0xa1, 0x04,
+ 0x59, 0xd1, 0xbb, 0xaa, 0x58, 0xf7, 0xce, 0xfb,
+ 0x86, 0x66, 0x22, 0xfc, 0x78, 0x74, 0x6e, 0x85,
+ 0xf1, 0x59, 0x7d, 0x9e, 0x1c, 0x3b, 0xc6, 0x65,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x6b, 0x1f, 0x7c, 0x9a, 0x65, 0x7f, 0x09, 0x61,
+ 0xe5, 0x04, 0x9a, 0xf1, 0x4b, 0x36, 0x8e, 0x41,
+ 0x86, 0xcf, 0x86, 0x19, 0xd8, 0xc9, 0x34, 0x70,
+ 0x67, 0xd1, 0x03, 0x72, 0x12, 0xf7, 0x27, 0x92,
+ 0x2e, 0x3d, 0x2b, 0x54, 0x9a, 0x48, 0xa4, 0xc2,
+ 0x61, 0xea, 0x6a, 0xe8, 0xdd, 0x07, 0x41, 0x85,
+ 0x58, 0x6d, 0xcd, 0x12, 0x0d, 0xbc, 0xb1, 0x23,
+ 0xb2, 0xdb, 0x24, 0x1f, 0xc4, 0xa7, 0xae, 0xda,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x50, 0xd8, 0xdc, 0xb2, 0x50, 0x24, 0x7a, 0x49,
+ 0xb1, 0x00, 0x73, 0x16, 0x1f, 0xce, 0xf9, 0xe8,
+ 0x77, 0x0a, 0x27, 0x74, 0xc7, 0xeb, 0xf0, 0x62,
+ 0xb9, 0xf3, 0x24, 0xa6, 0x03, 0x18, 0x40, 0xde,
+ 0x9b, 0x1d, 0xa8, 0xd0, 0xbf, 0x66, 0xa3, 0xc1,
+ 0x31, 0x04, 0x95, 0xc7, 0xc3, 0xb7, 0x11, 0xe2,
+ 0x1e, 0x31, 0x49, 0x98, 0x06, 0xab, 0xf0, 0xe6,
+ 0x5c, 0xac, 0x88, 0x28, 0x0b, 0x3d, 0xb2, 0xc2,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0xd4, 0x2b, 0x6b, 0x9e, 0xfc, 0x44, 0xc0, 0x90,
+ 0x64, 0x77, 0x5d, 0xf3, 0x44, 0xb6, 0x92, 0x8f,
+ 0x80, 0xe2, 0xe4, 0x9b, 0xaf, 0x49, 0x04, 0xea,
+ 0x29, 0xf7, 0x4a, 0x33, 0x3f, 0xc7, 0x3b, 0xab,
+ 0xa1, 0x71, 0x7f, 0xa2, 0x8e, 0x03, 0xa0, 0xd6,
+ 0xa7, 0xcd, 0xe0, 0xf8, 0xd7, 0x3b, 0xa4, 0x0d,
+ 0x84, 0x79, 0x12, 0x72, 0x3f, 0x8e, 0x48, 0x35,
+ 0x76, 0x4f, 0x56, 0xe9, 0x21, 0x40, 0x19, 0xbe,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0x84, 0xd4, 0xd8, 0x6c, 0x60, 0x3d, 0x6e, 0xfd,
+ 0x84, 0xb7, 0xdf, 0xba, 0x13, 0x5e, 0x07, 0x94,
+ 0x5b, 0x6b, 0x62, 0x1d, 0x82, 0x02, 0xa7, 0xb3,
+ 0x21, 0xdf, 0x42, 0x20, 0x85, 0xa8, 0x6f, 0x30,
+ 0xf7, 0x03, 0xba, 0x66, 0x0e, 0xa6, 0x42, 0x21,
+ 0x37, 0xe8, 0xed, 0x5b, 0x22, 0xf5, 0x4e, 0xa5,
+ 0xe5, 0x80, 0x1b, 0x47, 0xf0, 0x49, 0xb3, 0xe5,
+ 0x6e, 0xd9, 0xd9, 0x95, 0x3d, 0x2e, 0x42, 0x13,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0x71, 0x17, 0xab, 0x93, 0xfe, 0x3b, 0xa4, 0xe6,
+ 0xcb, 0xb0, 0xea, 0x95, 0xe7, 0x1a, 0x01, 0xc0,
+ 0x12, 0x33, 0xfe, 0xcc, 0x79, 0x15, 0xae, 0x56,
+ 0xd2, 0x70, 0x44, 0x60, 0x54, 0x42, 0xa8, 0x69,
+ 0x7e, 0xc3, 0x90, 0xa0, 0x0c, 0x63, 0x39, 0xff,
+ 0x55, 0x53, 0xb8, 0x46, 0xef, 0x06, 0xcb, 0xba,
+ 0x73, 0xf4, 0x76, 0x22, 0xf1, 0x60, 0x98, 0xbc,
+ 0xbf, 0x76, 0x95, 0x85, 0x13, 0x1d, 0x11, 0x3b,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0x3a, 0xaa, 0x85, 0xa0, 0x8c, 0x8e, 0xe1, 0x9c,
+ 0x9b, 0x43, 0x72, 0x7f, 0x40, 0x88, 0x3b, 0xd1,
+ 0xc4, 0xd8, 0x2b, 0x69, 0xa6, 0x74, 0x47, 0x69,
+ 0x5f, 0x7d, 0xab, 0x75, 0xa9, 0xf9, 0x88, 0x54,
+ 0xce, 0x57, 0xcc, 0x9d, 0xac, 0x13, 0x91, 0xdb,
+ 0x6d, 0x5c, 0xd8, 0xf4, 0x35, 0xc9, 0x30, 0xf0,
+ 0x4b, 0x91, 0x25, 0xab, 0x92, 0xa8, 0xc8, 0x6f,
+ 0xa0, 0xeb, 0x71, 0x56, 0x95, 0xab, 0xfd, 0xd7,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0xe1, 0xe9, 0xbe, 0x6c, 0x96, 0xe2, 0xe8, 0xa6,
+ 0x53, 0xcd, 0x79, 0x77, 0x57, 0x51, 0x2f, 0xb2,
+ 0x9f, 0xfc, 0x09, 0xaa, 0x2c, 0xbc, 0x6c, 0x5f,
+ 0xb0, 0xf2, 0x12, 0x39, 0x54, 0xd7, 0x27, 0xf8,
+ 0x33, 0x5d, 0xd4, 0x8a, 0xca, 0xd8, 0x2e, 0xbb,
+ 0x02, 0x82, 0xca, 0x1b, 0x54, 0xfa, 0xd6, 0xf4,
+ 0x49, 0x63, 0xfc, 0xc8, 0x73, 0xd4, 0x26, 0x8d,
+ 0x4f, 0x1c, 0x56, 0xa7, 0xf4, 0x58, 0x6f, 0x51,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0xf2, 0xf6, 0xe1, 0x16, 0x98, 0x69, 0x74, 0x5f,
+ 0x6c, 0xc4, 0x9d, 0x34, 0xa2, 0x84, 0x5d, 0x47,
+ 0xac, 0x39, 0xe0, 0x14, 0x2d, 0x78, 0xfa, 0x27,
+ 0xd5, 0x18, 0xaf, 0x26, 0x89, 0xa4, 0x69, 0xd3,
+ 0x56, 0xde, 0xfe, 0x4b, 0x9f, 0x0c, 0x9d, 0x5a,
+ 0x9a, 0x73, 0x3e, 0x3c, 0x76, 0x4b, 0x96, 0xca,
+ 0x49, 0xda, 0x05, 0x8c, 0x53, 0xbb, 0x85, 0x89,
+ 0x60, 0xc7, 0xe0, 0xb3, 0x51, 0x18, 0xd2, 0xd2,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0xfc, 0x5c, 0xcf, 0xbf, 0x29, 0xe3, 0x01, 0xef,
+ 0x4b, 0x40, 0x70, 0x01, 0xca, 0x4d, 0x46, 0xce,
+ 0xa9, 0x95, 0x5d, 0xb4, 0xf1, 0x79, 0x29, 0xdb,
+ 0xac, 0x32, 0x3d, 0xd9, 0x60, 0x9e, 0x6b, 0xb8,
+ 0x28, 0x62, 0xb7, 0x4a, 0xbb, 0x33, 0xb9, 0xd0,
+ 0x83, 0xe0, 0xd7, 0x5a, 0x2d, 0x01, 0x4c, 0x61,
+ 0x9e, 0x7d, 0x2d, 0x2d, 0x60, 0x29, 0x5e, 0x60,
+ 0x10, 0xb7, 0x41, 0x00, 0x3f, 0xe5, 0xf7, 0x52,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0xf8, 0xe5, 0x4b, 0xe5, 0x89, 0xf9, 0x1b, 0x43,
+ 0xbb, 0x65, 0x3d, 0xa0, 0xb4, 0xdc, 0x04, 0x26,
+ 0x68, 0x15, 0xae, 0x4d, 0xd6, 0x03, 0xb7, 0x27,
+ 0x06, 0x8c, 0x2a, 0x82, 0x51, 0x96, 0xbf, 0x83,
+ 0x38, 0x96, 0x21, 0x8a, 0xd9, 0xf9, 0x4e, 0x38,
+ 0xc6, 0xb3, 0xbd, 0xfe, 0xd3, 0x49, 0x90, 0xbc,
+ 0xa1, 0x77, 0xd0, 0xa0, 0x3c, 0x2b, 0x4e, 0x10,
+ 0x34, 0xc3, 0x17, 0x85, 0x3d, 0xec, 0xa8, 0x05,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0x38, 0x56, 0xaf, 0x83, 0x68, 0x9c, 0xba, 0xe3,
+ 0xec, 0x51, 0xf5, 0xf4, 0x93, 0x48, 0x1d, 0xe6,
+ 0xad, 0xa8, 0x8c, 0x70, 0x2a, 0xd9, 0xaa, 0x43,
+ 0x04, 0x40, 0x95, 0xc1, 0xe6, 0x8a, 0xf5, 0x01,
+ 0x6b, 0x79, 0xd9, 0xb4, 0xd0, 0x1d, 0x93, 0x26,
+ 0xfe, 0xf5, 0x07, 0x57, 0xda, 0x08, 0x0a, 0x82,
+ 0xc9, 0x17, 0x13, 0x5b, 0x9e, 0x11, 0x96, 0xa5,
+ 0xd0, 0x92, 0xcd, 0xf1, 0xa3, 0x5b, 0x43, 0x21,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[BLAKE2B_HASH_SIZE] = {
+ 0xa4, 0xf8, 0xf6, 0xa1, 0x36, 0x89, 0xc0, 0x2a,
+ 0xc3, 0x42, 0x32, 0x71, 0xe5, 0xea, 0x14, 0x77,
+ 0xf3, 0x99, 0x91, 0x87, 0x49, 0xc2, 0x8d, 0xa5,
+ 0x2f, 0xed, 0x01, 0x35, 0x39, 0x64, 0x09, 0x25,
+ 0xe3, 0xa8, 0x50, 0x97, 0x35, 0x8b, 0xf5, 0x19,
+ 0x1e, 0xd5, 0x9f, 0x03, 0x0b, 0x65, 0x55, 0x0e,
+ 0xa0, 0xb7, 0xda, 0x18, 0x7b, 0x7f, 0x88, 0x55,
+ 0x1f, 0xdb, 0x82, 0x6b, 0x98, 0x90, 0x1c, 0xdd,
+};
+
+static const u8 blake2b_keyed_testvec_consolidated[BLAKE2B_HASH_SIZE] = {
+ 0x2b, 0x89, 0x36, 0x3a, 0x36, 0xe4, 0x18, 0x38,
+ 0xc4, 0x5b, 0x5c, 0xa5, 0x9a, 0xed, 0xf2, 0xee,
+ 0x5a, 0xb6, 0x82, 0x6c, 0x63, 0xf2, 0x29, 0x57,
+ 0xc7, 0xd5, 0x32, 0x27, 0xba, 0x88, 0xb1, 0xab,
+ 0xf2, 0x2a, 0xc1, 0xea, 0xf3, 0x91, 0x89, 0x66,
+ 0x47, 0x1e, 0x5b, 0xc6, 0x98, 0x12, 0xe9, 0x25,
+ 0xbf, 0x72, 0xd2, 0x3f, 0x88, 0x97, 0x17, 0x51,
+ 0xed, 0x96, 0xfb, 0xe9, 0xca, 0x52, 0x42, 0xc9,
+};
diff --git a/lib/crypto/tests/blake2b_kunit.c b/lib/crypto/tests/blake2b_kunit.c
new file mode 100644
index 000000000000..bc0be7da1e76
--- /dev/null
+++ b/lib/crypto/tests/blake2b_kunit.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/blake2b.h>
+#include "blake2b-testvecs.h"
+
+/*
+ * The following are compatibility functions that present BLAKE2b as an unkeyed
+ * hash function that produces hashes of fixed length BLAKE2B_HASH_SIZE, so that
+ * hash-test-template.h can be reused to test it.
+ */
+
+static void blake2b_default(const u8 *data, size_t len,
+ u8 out[BLAKE2B_HASH_SIZE])
+{
+ blake2b(NULL, 0, data, len, out, BLAKE2B_HASH_SIZE);
+}
+
+static void blake2b_init_default(struct blake2b_ctx *ctx)
+{
+ blake2b_init(ctx, BLAKE2B_HASH_SIZE);
+}
+
+/*
+ * Generate the HASH_KUNIT_CASES using hash-test-template.h. These test BLAKE2b
+ * with a key length of 0 and a hash length of BLAKE2B_HASH_SIZE.
+ */
+#define HASH blake2b_default
+#define HASH_CTX blake2b_ctx
+#define HASH_SIZE BLAKE2B_HASH_SIZE
+#define HASH_INIT blake2b_init_default
+#define HASH_UPDATE blake2b_update
+#define HASH_FINAL blake2b_final
+#include "hash-test-template.h"
+
+/*
+ * BLAKE2b specific test case which tests all possible combinations of key
+ * length and hash length.
+ */
+static void test_blake2b_all_key_and_hash_lens(struct kunit *test)
+{
+ const size_t data_len = 100;
+ u8 *data = &test_buf[0];
+ u8 *key = data + data_len;
+ u8 *hash = key + BLAKE2B_KEY_SIZE;
+ struct blake2b_ctx main_ctx;
+ u8 main_hash[BLAKE2B_HASH_SIZE];
+
+ rand_bytes_seeded_from_len(data, data_len);
+ blake2b_init(&main_ctx, BLAKE2B_HASH_SIZE);
+ for (int key_len = 0; key_len <= BLAKE2B_KEY_SIZE; key_len++) {
+ rand_bytes_seeded_from_len(key, key_len);
+ for (int out_len = 1; out_len <= BLAKE2B_HASH_SIZE; out_len++) {
+ blake2b(key, key_len, data, data_len, hash, out_len);
+ blake2b_update(&main_ctx, hash, out_len);
+ }
+ }
+ blake2b_final(&main_ctx, main_hash);
+ KUNIT_ASSERT_MEMEQ(test, main_hash, blake2b_keyed_testvec_consolidated,
+ BLAKE2B_HASH_SIZE);
+}
+
+/*
+ * BLAKE2b specific test case which tests using a guarded buffer for all allowed
+ * key lengths. Also tests both blake2b() and blake2b_init_key().
+ */
+static void test_blake2b_with_guarded_key_buf(struct kunit *test)
+{
+ const size_t data_len = 100;
+
+ rand_bytes(test_buf, data_len);
+ for (int key_len = 0; key_len <= BLAKE2B_KEY_SIZE; key_len++) {
+ u8 key[BLAKE2B_KEY_SIZE];
+ u8 *guarded_key = &test_buf[TEST_BUF_LEN - key_len];
+ u8 hash1[BLAKE2B_HASH_SIZE];
+ u8 hash2[BLAKE2B_HASH_SIZE];
+ struct blake2b_ctx ctx;
+
+ rand_bytes(key, key_len);
+ memcpy(guarded_key, key, key_len);
+
+ blake2b(key, key_len, test_buf, data_len,
+ hash1, BLAKE2B_HASH_SIZE);
+ blake2b(guarded_key, key_len, test_buf, data_len,
+ hash2, BLAKE2B_HASH_SIZE);
+ KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2B_HASH_SIZE);
+
+ blake2b_init_key(&ctx, BLAKE2B_HASH_SIZE, guarded_key, key_len);
+ blake2b_update(&ctx, test_buf, data_len);
+ blake2b_final(&ctx, hash2);
+ KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2B_HASH_SIZE);
+ }
+}
+
+/*
+ * BLAKE2b specific test case which tests using a guarded output buffer for all
+ * allowed output lengths.
+ */
+static void test_blake2b_with_guarded_out_buf(struct kunit *test)
+{
+ const size_t data_len = 100;
+
+ rand_bytes(test_buf, data_len);
+ for (int out_len = 1; out_len <= BLAKE2B_HASH_SIZE; out_len++) {
+ u8 hash[BLAKE2B_HASH_SIZE];
+ u8 *guarded_hash = &test_buf[TEST_BUF_LEN - out_len];
+
+ blake2b(NULL, 0, test_buf, data_len, hash, out_len);
+ blake2b(NULL, 0, test_buf, data_len, guarded_hash, out_len);
+ KUNIT_ASSERT_MEMEQ(test, hash, guarded_hash, out_len);
+ }
+}
+
+static struct kunit_case blake2b_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(test_blake2b_all_key_and_hash_lens),
+ KUNIT_CASE(test_blake2b_with_guarded_key_buf),
+ KUNIT_CASE(test_blake2b_with_guarded_out_buf),
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite blake2b_test_suite = {
+ .name = "blake2b",
+ .test_cases = blake2b_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(blake2b_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for BLAKE2b");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/blake2s-testvecs.h b/lib/crypto/tests/blake2s-testvecs.h
new file mode 100644
index 000000000000..6f978b79a59b
--- /dev/null
+++ b/lib/crypto/tests/blake2s-testvecs.h
@@ -0,0 +1,238 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py blake2s */
+
+static const struct {
+ size_t data_len;
+ u8 digest[BLAKE2S_HASH_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0x69, 0x21, 0x7a, 0x30, 0x79, 0x90, 0x80, 0x94,
+ 0xe1, 0x11, 0x21, 0xd0, 0x42, 0x35, 0x4a, 0x7c,
+ 0x1f, 0x55, 0xb6, 0x48, 0x2c, 0xa1, 0xa5, 0x1e,
+ 0x1b, 0x25, 0x0d, 0xfd, 0x1e, 0xd0, 0xee, 0xf9,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0x7c, 0xab, 0x53, 0xe2, 0x48, 0x87, 0xdf, 0x64,
+ 0x98, 0x6a, 0xc1, 0x7e, 0xf0, 0x01, 0x4d, 0xc9,
+ 0x07, 0x4f, 0xb8, 0x2f, 0x46, 0xd7, 0xee, 0xa9,
+ 0xad, 0xe5, 0xf8, 0x21, 0xac, 0xfe, 0x17, 0x58,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0x5e, 0x63, 0x2c, 0xd0, 0xf8, 0x7b, 0xf5, 0xae,
+ 0x61, 0x97, 0x94, 0x57, 0xc8, 0x76, 0x22, 0xd9,
+ 0x8b, 0x04, 0x5e, 0xf1, 0x5d, 0xd0, 0xfc, 0xd9,
+ 0x0c, 0x19, 0x2e, 0xe2, 0xc5, 0xd9, 0x73, 0x51,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0x33, 0x65, 0xa6, 0x37, 0xbf, 0xf8, 0x4f, 0x15,
+ 0x4c, 0xac, 0x9e, 0xa4, 0x3b, 0x02, 0x07, 0x0c,
+ 0x80, 0x86, 0x0d, 0x6c, 0xe4, 0xaf, 0x1c, 0xbc,
+ 0x0b, 0x9c, 0x0a, 0x98, 0xc2, 0x99, 0x71, 0xcd,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0x59, 0xd2, 0x10, 0xd3, 0x75, 0xac, 0x48, 0x32,
+ 0xb1, 0xea, 0xee, 0xcf, 0x0a, 0xd2, 0x8b, 0x15,
+ 0x5d, 0x72, 0x71, 0x4c, 0xa7, 0x29, 0xb0, 0x7a,
+ 0x44, 0x48, 0x8a, 0x54, 0x54, 0x54, 0x41, 0xf5,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0xdc, 0xfc, 0x46, 0x81, 0xc6, 0x1b, 0x2b, 0x47,
+ 0x8b, 0xed, 0xe0, 0x73, 0x34, 0x38, 0x53, 0x92,
+ 0x97, 0x2f, 0xfb, 0x51, 0xab, 0x4f, 0x2d, 0x9d,
+ 0x69, 0x04, 0xa9, 0x5d, 0x33, 0xef, 0xcb, 0x1c,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0xd6, 0x2a, 0x7f, 0x96, 0x04, 0x4d, 0x16, 0xc8,
+ 0x49, 0xe0, 0x37, 0x33, 0xe3, 0x7b, 0x34, 0x56,
+ 0x99, 0xc5, 0x78, 0x57, 0x06, 0x02, 0xb4, 0xea,
+ 0x80, 0xc4, 0xf8, 0x8f, 0x8d, 0x2b, 0xe4, 0x05,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x8b, 0x58, 0x62, 0xb5, 0x85, 0xf6, 0x83, 0x36,
+ 0xf5, 0x34, 0xb8, 0xd4, 0xbc, 0x5c, 0x8b, 0x38,
+ 0xfd, 0x15, 0xcd, 0x44, 0x83, 0x25, 0x71, 0xe1,
+ 0xd5, 0xe8, 0xa1, 0xa4, 0x36, 0x98, 0x7e, 0x68,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0x7e, 0xeb, 0x06, 0x87, 0xdf, 0x1a, 0xdc, 0xe5,
+ 0xfb, 0x64, 0xd4, 0xd1, 0x5d, 0x9e, 0x75, 0xc0,
+ 0xb9, 0xad, 0x55, 0x6c, 0xe6, 0xba, 0x4d, 0x98,
+ 0x2f, 0xbf, 0x72, 0xad, 0x61, 0x37, 0xf6, 0x11,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x72, 0xdb, 0x43, 0x16, 0x57, 0x8e, 0x3a, 0x96,
+ 0xf3, 0x98, 0x19, 0x24, 0x17, 0x3b, 0xe8, 0xad,
+ 0xa1, 0x9b, 0xa4, 0x1b, 0x74, 0x85, 0x2e, 0x24,
+ 0x70, 0xea, 0x31, 0x5a, 0x1c, 0xbe, 0x43, 0xb5,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0x32, 0x48, 0xb0, 0xf0, 0x3f, 0xbb, 0xd2, 0xa3,
+ 0xfd, 0xf6, 0x28, 0x4a, 0x2a, 0xc5, 0xbe, 0x4b,
+ 0x73, 0x50, 0x63, 0xd6, 0x16, 0x00, 0xef, 0xed,
+ 0xfe, 0x97, 0x41, 0x29, 0xb2, 0x84, 0xc4, 0xa3,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0x17, 0xda, 0x6b, 0x96, 0x6a, 0xa6, 0xa4, 0xa6,
+ 0xa6, 0xf3, 0x9d, 0x18, 0x19, 0x8d, 0x98, 0x7c,
+ 0x66, 0x38, 0xe8, 0x99, 0xe7, 0x0a, 0x50, 0x92,
+ 0xaf, 0x11, 0x80, 0x05, 0x66, 0xed, 0xab, 0x74,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x13, 0xd5, 0x8b, 0x22, 0xae, 0x90, 0x7b, 0x67,
+ 0x87, 0x4e, 0x3c, 0x35, 0x4e, 0x01, 0xf0, 0xb1,
+ 0xd3, 0xd1, 0x67, 0xbb, 0x43, 0xdb, 0x7c, 0x75,
+ 0xa4, 0xc7, 0x64, 0x83, 0x1e, 0x9b, 0x98, 0xad,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x6f, 0xe0, 0x5d, 0x9d, 0xd5, 0x78, 0x29, 0xfb,
+ 0xd0, 0x77, 0xd1, 0x8a, 0xf0, 0x80, 0xcb, 0x81,
+ 0x71, 0x9e, 0x4d, 0x49, 0xde, 0x74, 0x2a, 0x37,
+ 0xc0, 0xd5, 0xf0, 0xfa, 0x50, 0xe6, 0x23, 0xfe,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x89, 0xac, 0xf6, 0xe7, 0x5e, 0xba, 0x53, 0xf4,
+ 0x92, 0x32, 0xd5, 0x64, 0xfb, 0xc4, 0x08, 0xac,
+ 0x2c, 0x19, 0x6e, 0x63, 0x13, 0x75, 0xd0, 0x60,
+ 0x54, 0x35, 0x82, 0xc4, 0x6d, 0x03, 0x1a, 0x05,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0x1c, 0xaf, 0x94, 0x7d, 0x9c, 0xce, 0x57, 0x64,
+ 0xf8, 0xa8, 0x25, 0x45, 0x32, 0x86, 0x2b, 0x04,
+ 0xb3, 0x2e, 0x67, 0xca, 0x73, 0x04, 0x2f, 0xab,
+ 0xcc, 0xda, 0x9e, 0x42, 0xa1, 0xaf, 0x83, 0x5a,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0x21, 0xdf, 0xdc, 0x29, 0xd9, 0xfc, 0x7b, 0xe7,
+ 0x3a, 0xc4, 0xe1, 0x61, 0xc5, 0xb5, 0xe1, 0xee,
+ 0x7a, 0x9d, 0x0c, 0x66, 0x36, 0x63, 0xe4, 0x12,
+ 0x62, 0xe2, 0xf5, 0x68, 0x72, 0xfc, 0x1e, 0x18,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0x6e, 0xc7, 0x2e, 0xac, 0xd0, 0xbb, 0x22, 0xe0,
+ 0xc2, 0x40, 0xb2, 0xfe, 0x8c, 0xaf, 0x9e, 0xcf,
+ 0x32, 0x06, 0xc6, 0x45, 0x29, 0xbd, 0xe0, 0x7f,
+ 0x53, 0x32, 0xc3, 0x2b, 0x2f, 0x68, 0x12, 0xcd,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0x76, 0xba, 0x52, 0xb5, 0x09, 0xf5, 0x19, 0x09,
+ 0x70, 0x1c, 0x09, 0x28, 0xb4, 0xaa, 0x98, 0x6a,
+ 0x79, 0xe7, 0x5e, 0xcd, 0xe8, 0xa4, 0x73, 0x69,
+ 0x1f, 0xf8, 0x05, 0x0a, 0xb4, 0xfe, 0xf9, 0x63,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0xf7, 0xad, 0xf9, 0xc8, 0x0e, 0x04, 0x2f, 0xdf,
+ 0xbe, 0x39, 0x79, 0x07, 0x0d, 0xd8, 0x1b, 0x06,
+ 0x42, 0x3a, 0x43, 0x93, 0xf6, 0x7c, 0xc4, 0xe5,
+ 0xc2, 0xd5, 0xd0, 0xa6, 0x35, 0x6c, 0xbd, 0x17,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0x38, 0xd7, 0xab, 0x7e, 0x08, 0xdc, 0x1e, 0xab,
+ 0x55, 0xbb, 0x3b, 0x7b, 0x6a, 0x17, 0xcc, 0x79,
+ 0xa7, 0x02, 0x62, 0x66, 0x9b, 0xca, 0xee, 0xc0,
+ 0x3d, 0x75, 0x34, 0x2e, 0x55, 0x82, 0x26, 0x3c,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0xf7, 0xeb, 0x2f, 0x24, 0x98, 0x54, 0x04, 0x5a,
+ 0x19, 0xe4, 0x12, 0x9d, 0x97, 0xbc, 0x87, 0xa5,
+ 0x0b, 0x85, 0x29, 0xa1, 0x36, 0x89, 0xc9, 0xba,
+ 0xa0, 0xe0, 0xac, 0x99, 0x7d, 0xa4, 0x51, 0x9f,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0x8f, 0xe8, 0xa7, 0x79, 0x02, 0xbb, 0x4a, 0x56,
+ 0x66, 0x91, 0xef, 0x22, 0xd1, 0x09, 0x26, 0x6c,
+ 0xa9, 0x13, 0xd7, 0x44, 0xc7, 0x19, 0x9c, 0x0b,
+ 0xfb, 0x4f, 0xca, 0x72, 0x8f, 0x34, 0xf7, 0x82,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0xaa, 0x21, 0xbb, 0x25, 0x4b, 0x66, 0x6e, 0x29,
+ 0x71, 0xc1, 0x44, 0x67, 0x19, 0xed, 0xe6, 0xe6,
+ 0x61, 0x13, 0xf4, 0xb7, 0x02, 0x94, 0x81, 0x0f,
+ 0xa7, 0x4d, 0xbb, 0x2c, 0xb8, 0xeb, 0x41, 0x0e,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[BLAKE2S_HASH_SIZE] = {
+ 0x84, 0x21, 0xbb, 0x73, 0x64, 0x47, 0x45, 0xe0,
+ 0xc1, 0x83, 0x78, 0xf1, 0xea, 0xe5, 0xfd, 0xdb,
+ 0x01, 0xda, 0xb7, 0x86, 0x70, 0x3b, 0x83, 0xb3,
+ 0xbc, 0xd9, 0xfd, 0x96, 0xbd, 0x50, 0x06, 0x67,
+};
+
+static const u8 blake2s_keyed_testvec_consolidated[BLAKE2S_HASH_SIZE] = {
+ 0xa6, 0xad, 0xcd, 0xb8, 0xd9, 0xdd, 0xc7, 0x70,
+ 0x07, 0x09, 0x7f, 0x9f, 0x41, 0xa9, 0x70, 0xa4,
+ 0x1c, 0xca, 0x61, 0xbb, 0x58, 0xb5, 0xb2, 0x1d,
+ 0xd1, 0x71, 0x16, 0xb0, 0x49, 0x4f, 0x9e, 0x1b,
+};
diff --git a/lib/crypto/tests/blake2s_kunit.c b/lib/crypto/tests/blake2s_kunit.c
new file mode 100644
index 000000000000..6832d9aa7b82
--- /dev/null
+++ b/lib/crypto/tests/blake2s_kunit.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/blake2s.h>
+#include "blake2s-testvecs.h"
+
+/*
+ * The following are compatibility functions that present BLAKE2s as an unkeyed
+ * hash function that produces hashes of fixed length BLAKE2S_HASH_SIZE, so that
+ * hash-test-template.h can be reused to test it.
+ */
+
+static void blake2s_default(const u8 *data, size_t len,
+ u8 out[BLAKE2S_HASH_SIZE])
+{
+ blake2s(NULL, 0, data, len, out, BLAKE2S_HASH_SIZE);
+}
+
+static void blake2s_init_default(struct blake2s_ctx *ctx)
+{
+ blake2s_init(ctx, BLAKE2S_HASH_SIZE);
+}
+
+/*
+ * Generate the HASH_KUNIT_CASES using hash-test-template.h. These test BLAKE2s
+ * with a key length of 0 and a hash length of BLAKE2S_HASH_SIZE.
+ */
+#define HASH blake2s_default
+#define HASH_CTX blake2s_ctx
+#define HASH_SIZE BLAKE2S_HASH_SIZE
+#define HASH_INIT blake2s_init_default
+#define HASH_UPDATE blake2s_update
+#define HASH_FINAL blake2s_final
+#include "hash-test-template.h"
+
+/*
+ * BLAKE2s specific test case which tests all possible combinations of key
+ * length and hash length.
+ */
+static void test_blake2s_all_key_and_hash_lens(struct kunit *test)
+{
+ const size_t data_len = 100;
+ u8 *data = &test_buf[0];
+ u8 *key = data + data_len;
+ u8 *hash = key + BLAKE2S_KEY_SIZE;
+ struct blake2s_ctx main_ctx;
+ u8 main_hash[BLAKE2S_HASH_SIZE];
+
+ rand_bytes_seeded_from_len(data, data_len);
+ blake2s_init(&main_ctx, BLAKE2S_HASH_SIZE);
+ for (int key_len = 0; key_len <= BLAKE2S_KEY_SIZE; key_len++) {
+ rand_bytes_seeded_from_len(key, key_len);
+ for (int out_len = 1; out_len <= BLAKE2S_HASH_SIZE; out_len++) {
+ blake2s(key, key_len, data, data_len, hash, out_len);
+ blake2s_update(&main_ctx, hash, out_len);
+ }
+ }
+ blake2s_final(&main_ctx, main_hash);
+ KUNIT_ASSERT_MEMEQ(test, main_hash, blake2s_keyed_testvec_consolidated,
+ BLAKE2S_HASH_SIZE);
+}
+
+/*
+ * BLAKE2s specific test case which tests using a guarded buffer for all allowed
+ * key lengths. Also tests both blake2s() and blake2s_init_key().
+ */
+static void test_blake2s_with_guarded_key_buf(struct kunit *test)
+{
+ const size_t data_len = 100;
+
+ rand_bytes(test_buf, data_len);
+ for (int key_len = 0; key_len <= BLAKE2S_KEY_SIZE; key_len++) {
+ u8 key[BLAKE2S_KEY_SIZE];
+ u8 *guarded_key = &test_buf[TEST_BUF_LEN - key_len];
+ u8 hash1[BLAKE2S_HASH_SIZE];
+ u8 hash2[BLAKE2S_HASH_SIZE];
+ struct blake2s_ctx ctx;
+
+ rand_bytes(key, key_len);
+ memcpy(guarded_key, key, key_len);
+
+ blake2s(key, key_len, test_buf, data_len,
+ hash1, BLAKE2S_HASH_SIZE);
+ blake2s(guarded_key, key_len, test_buf, data_len,
+ hash2, BLAKE2S_HASH_SIZE);
+ KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE);
+
+ blake2s_init_key(&ctx, BLAKE2S_HASH_SIZE, guarded_key, key_len);
+ blake2s_update(&ctx, test_buf, data_len);
+ blake2s_final(&ctx, hash2);
+ KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE);
+ }
+}
+
+/*
+ * BLAKE2s specific test case which tests using a guarded output buffer for all
+ * allowed output lengths.
+ */
+static void test_blake2s_with_guarded_out_buf(struct kunit *test)
+{
+ const size_t data_len = 100;
+
+ rand_bytes(test_buf, data_len);
+ for (int out_len = 1; out_len <= BLAKE2S_HASH_SIZE; out_len++) {
+ u8 hash[BLAKE2S_HASH_SIZE];
+ u8 *guarded_hash = &test_buf[TEST_BUF_LEN - out_len];
+
+ blake2s(NULL, 0, test_buf, data_len, hash, out_len);
+ blake2s(NULL, 0, test_buf, data_len, guarded_hash, out_len);
+ KUNIT_ASSERT_MEMEQ(test, hash, guarded_hash, out_len);
+ }
+}
+
+static struct kunit_case blake2s_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(test_blake2s_all_key_and_hash_lens),
+ KUNIT_CASE(test_blake2s_with_guarded_key_buf),
+ KUNIT_CASE(test_blake2s_with_guarded_out_buf),
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite blake2s_test_suite = {
+ .name = "blake2s",
+ .test_cases = blake2s_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(blake2s_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for BLAKE2s");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/curve25519-selftest.c b/lib/crypto/tests/curve25519_kunit.c
index c85e85381e78..248d05f66b35 100644
--- a/lib/crypto/curve25519-selftest.c
+++ b/lib/crypto/tests/curve25519_kunit.c
@@ -4,6 +4,8 @@
*/
#include <crypto/curve25519.h>
+#include <kunit/test.h>
+#include <linux/timekeeping.h>
struct curve25519_test_vector {
u8 private[CURVE25519_KEY_SIZE];
@@ -11,7 +13,7 @@ struct curve25519_test_vector {
u8 result[CURVE25519_KEY_SIZE];
bool valid;
};
-static const struct curve25519_test_vector curve25519_test_vectors[] __initconst = {
+static const struct curve25519_test_vector curve25519_test_vectors[] = {
{
.private = { 0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d,
0x3c, 0x16, 0xc1, 0x72, 0x51, 0xb2, 0x66, 0x45,
@@ -1280,42 +1282,82 @@ static const struct curve25519_test_vector curve25519_test_vectors[] __initconst
}
};
-bool __init curve25519_selftest(void)
+static void test_curve25519(struct kunit *test)
{
- bool success = true, ret, ret2;
- size_t i = 0, j;
- u8 in[CURVE25519_KEY_SIZE];
- u8 out[CURVE25519_KEY_SIZE], out2[CURVE25519_KEY_SIZE],
- out3[CURVE25519_KEY_SIZE];
+ for (size_t i = 0; i < ARRAY_SIZE(curve25519_test_vectors); ++i) {
+ const struct curve25519_test_vector *vec =
+ &curve25519_test_vectors[i];
+ u8 out[CURVE25519_KEY_SIZE] = {};
+ bool ret;
- for (i = 0; i < ARRAY_SIZE(curve25519_test_vectors); ++i) {
- memset(out, 0, CURVE25519_KEY_SIZE);
- ret = curve25519(out, curve25519_test_vectors[i].private,
- curve25519_test_vectors[i].public);
- if (ret != curve25519_test_vectors[i].valid ||
- memcmp(out, curve25519_test_vectors[i].result,
- CURVE25519_KEY_SIZE)) {
- pr_err("curve25519 self-test %zu: FAIL\n", i + 1);
- success = false;
- }
+ ret = curve25519(out, vec->private, vec->public);
+ KUNIT_EXPECT_EQ_MSG(test, ret, vec->valid,
+ "Wrong return value with test vector %zu",
+ i);
+ KUNIT_EXPECT_MEMEQ_MSG(test, out, vec->result, sizeof(out),
+ "Wrong output with test vector %zu", i);
}
+}
+
+static void test_curve25519_basepoint(struct kunit *test)
+{
+ for (size_t i = 0; i < 5; ++i) {
+ u8 in[CURVE25519_KEY_SIZE];
+ u8 out[CURVE25519_KEY_SIZE];
+ u8 out2[CURVE25519_KEY_SIZE];
+ bool ret, ret2;
- for (i = 0; i < 5; ++i) {
get_random_bytes(in, sizeof(in));
ret = curve25519_generate_public(out, in);
ret2 = curve25519(out2, in, (u8[CURVE25519_KEY_SIZE]){ 9 });
- curve25519_generic(out3, in, (u8[CURVE25519_KEY_SIZE]){ 9 });
- if (ret != ret2 ||
- memcmp(out, out2, CURVE25519_KEY_SIZE) ||
- memcmp(out, out3, CURVE25519_KEY_SIZE)) {
- pr_err("curve25519 basepoint self-test %zu: FAIL: input - 0x",
- i + 1);
- for (j = CURVE25519_KEY_SIZE; j-- > 0;)
- printk(KERN_CONT "%02x", in[j]);
- printk(KERN_CONT "\n");
- success = false;
- }
+ KUNIT_EXPECT_EQ_MSG(test, ret, ret2,
+ "in=%*phN", CURVE25519_KEY_SIZE, in);
+ KUNIT_EXPECT_MEMEQ_MSG(test, out, out2, CURVE25519_KEY_SIZE,
+ "in=%*phN", CURVE25519_KEY_SIZE, in);
}
+}
+
+static void benchmark_curve25519(struct kunit *test)
+{
+ const u8 *private = curve25519_test_vectors[0].private;
+ const u8 *public = curve25519_test_vectors[0].public;
+ const size_t warmup_niter = 5000;
+ const size_t benchmark_niter = 1024;
+ u8 out[CURVE25519_KEY_SIZE];
+ bool ok = true;
+ u64 t;
+
+ if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK))
+ kunit_skip(test, "not enabled");
- return success;
+ /* Warm-up */
+ for (size_t i = 0; i < warmup_niter; i++)
+ ok &= curve25519(out, private, public);
+
+ /* Benchmark */
+ preempt_disable();
+ t = ktime_get_ns();
+ for (size_t i = 0; i < benchmark_niter; i++)
+ ok &= curve25519(out, private, public);
+ t = ktime_get_ns() - t;
+ preempt_enable();
+ KUNIT_EXPECT_TRUE(test, ok);
+ kunit_info(test, "%llu ops/s",
+ div64_u64((u64)benchmark_niter * NSEC_PER_SEC, t ?: 1));
}
+
+static struct kunit_case curve25519_test_cases[] = {
+ KUNIT_CASE(test_curve25519),
+ KUNIT_CASE(test_curve25519_basepoint),
+ KUNIT_CASE(benchmark_curve25519),
+ {},
+};
+
+static struct kunit_suite curve25519_test_suite = {
+ .name = "curve25519",
+ .test_cases = curve25519_test_cases,
+};
+kunit_test_suite(curve25519_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for Curve25519");
+MODULE_LICENSE("GPL");
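A note on the reporting arithmetic in benchmark_curve25519() above: ops/s is computed as niter * NSEC_PER_SEC / elapsed_ns, with t ?: 1 guarding against a zero elapsed time. For a hypothetical run of 1024 iterations measured at 1,048,576 ns:

	u64 ops = div64_u64(1024ULL * NSEC_PER_SEC, 1048576);	/* == 976562 ops/s */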
diff --git a/lib/crypto/tests/hash-test-template.h b/lib/crypto/tests/hash-test-template.h
new file mode 100644
index 000000000000..61b43e62779f
--- /dev/null
+++ b/lib/crypto/tests/hash-test-template.h
@@ -0,0 +1,568 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Test cases for hash functions, including a benchmark. This is included by
+ * KUnit test suites that want to use it. See sha512_kunit.c for an example.
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <kunit/run-in-irq-context.h>
+#include <kunit/test.h>
+#include <linux/vmalloc.h>
+
+/* test_buf is a guarded buffer, i.e. &test_buf[TEST_BUF_LEN] is not mapped. */
+#define TEST_BUF_LEN 16384
+static u8 *test_buf;
+
+static u8 *orig_test_buf;
+
+static u64 random_seed;
+
+/*
+ * This is a simple linear congruential generator. It is used only for testing,
+ * which does not require cryptographically secure random numbers. A hard-coded
+ * algorithm is used instead of <linux/prandom.h> so that it matches the
+ * algorithm used by the test vector generation script. This allows the input
+ * data in random test vectors to be concisely stored as just the seed.
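+ * (The multiplier 25214903917 and increment 11 are the classic 48-bit
+ * drand48/java.util.Random LCG constants.)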
+ */
+static u32 rand32(void)
+{
+ random_seed = (random_seed * 25214903917 + 11) & ((1ULL << 48) - 1);
+ return random_seed >> 16;
+}
+
+static void rand_bytes(u8 *out, size_t len)
+{
+ for (size_t i = 0; i < len; i++)
+ out[i] = rand32();
+}
+
+static void rand_bytes_seeded_from_len(u8 *out, size_t len)
+{
+ random_seed = len;
+ rand_bytes(out, len);
+}
+
+static bool rand_bool(void)
+{
+ return rand32() % 2;
+}
+
+/* Generate a random length, preferring small lengths. */
+static size_t rand_length(size_t max_len)
+{
+ size_t len;
+
+ switch (rand32() % 3) {
+ case 0:
+ len = rand32() % 128;
+ break;
+ case 1:
+ len = rand32() % 3072;
+ break;
+ default:
+ len = rand32();
+ break;
+ }
+ return len % (max_len + 1);
+}
+
+static size_t rand_offset(size_t max_offset)
+{
+ return min(rand32() % 128, max_offset);
+}
+
+static int hash_suite_init(struct kunit_suite *suite)
+{
+ /*
+ * Allocate the test buffer using vmalloc() with a page-aligned length
+ * so that it is immediately followed by a guard page. This allows
+ * buffer overreads to be detected, even in assembly code.
+ */
+ size_t alloc_len = round_up(TEST_BUF_LEN, PAGE_SIZE);
+
+ orig_test_buf = vmalloc(alloc_len);
+ if (!orig_test_buf)
+ return -ENOMEM;
+
+ test_buf = orig_test_buf + alloc_len - TEST_BUF_LEN;
+ return 0;
+}
+
+static void hash_suite_exit(struct kunit_suite *suite)
+{
+ vfree(orig_test_buf);
+ orig_test_buf = NULL;
+ test_buf = NULL;
+}
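+
+/*
+ * Resulting layout (sketch): TEST_BUF_LEN is rounded up to whole pages and
+ * test_buf is placed at the end of the allocation, so the buffer ends exactly
+ * where vmalloc's unmapped guard area begins:
+ *
+ *   orig_test_buf .. [ unused ][ test_buf: TEST_BUF_LEN bytes ][ guard page ]
+ *
+ * An overread of test_buf therefore faults instead of silently succeeding.
+ */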
+
+/*
+ * Test the hash function against a list of test vectors.
+ *
+ * Note that it's only necessary to run each test vector in one way (e.g.,
+ * one-shot instead of incremental), since consistency between different ways of
+ * using the APIs is verified by other test cases.
+ */
+static void test_hash_test_vectors(struct kunit *test)
+{
+ for (size_t i = 0; i < ARRAY_SIZE(hash_testvecs); i++) {
+ size_t data_len = hash_testvecs[i].data_len;
+ u8 actual_hash[HASH_SIZE];
+
+ KUNIT_ASSERT_LE(test, data_len, TEST_BUF_LEN);
+ rand_bytes_seeded_from_len(test_buf, data_len);
+
+ HASH(test_buf, data_len, actual_hash);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, actual_hash, hash_testvecs[i].digest, HASH_SIZE,
+ "Wrong result with test vector %zu; data_len=%zu", i,
+ data_len);
+ }
+}
+
+/*
+ * Test that the hash function produces correct results for *every* length up to
+ * 4096 bytes. To do this, generate seeded random data, then calculate a hash
+ * value for each length 0..4096, then hash the hash values. Verify just the
+ * final hash value, which should match only when all hash values were correct.
+ */
+static void test_hash_all_lens_up_to_4096(struct kunit *test)
+{
+ struct HASH_CTX ctx;
+ u8 hash[HASH_SIZE];
+
+ static_assert(TEST_BUF_LEN >= 4096);
+ rand_bytes_seeded_from_len(test_buf, 4096);
+ HASH_INIT(&ctx);
+ for (size_t len = 0; len <= 4096; len++) {
+ HASH(test_buf, len, hash);
+ HASH_UPDATE(&ctx, hash, HASH_SIZE);
+ }
+ HASH_FINAL(&ctx, hash);
+ KUNIT_ASSERT_MEMEQ(test, hash, hash_testvec_consolidated, HASH_SIZE);
+}
+
+/*
+ * Test that the hash function produces the same result with a one-shot
+ * computation as it does with an incremental computation.
+ */
+static void test_hash_incremental_updates(struct kunit *test)
+{
+ for (int i = 0; i < 1000; i++) {
+ size_t total_len, offset;
+ struct HASH_CTX ctx;
+ u8 hash1[HASH_SIZE];
+ u8 hash2[HASH_SIZE];
+ size_t num_parts = 0;
+ size_t remaining_len, cur_offset;
+
+ total_len = rand_length(TEST_BUF_LEN);
+ offset = rand_offset(TEST_BUF_LEN - total_len);
+ rand_bytes(&test_buf[offset], total_len);
+
+ /* Compute the hash value in one shot. */
+ HASH(&test_buf[offset], total_len, hash1);
+
+ /*
+ * Compute the hash value incrementally, using a randomly
+ * selected sequence of update lengths that sum to total_len.
+ */
+ HASH_INIT(&ctx);
+ remaining_len = total_len;
+ cur_offset = offset;
+ while (rand_bool()) {
+ size_t part_len = rand_length(remaining_len);
+
+ HASH_UPDATE(&ctx, &test_buf[cur_offset], part_len);
+ num_parts++;
+ cur_offset += part_len;
+ remaining_len -= part_len;
+ }
+ if (remaining_len != 0 || rand_bool()) {
+ HASH_UPDATE(&ctx, &test_buf[cur_offset], remaining_len);
+ num_parts++;
+ }
+ HASH_FINAL(&ctx, hash2);
+
+ /* Verify that the two hash values are the same. */
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, hash1, hash2, HASH_SIZE,
+ "Incremental test failed with total_len=%zu num_parts=%zu offset=%zu",
+ total_len, num_parts, offset);
+ }
+}
+
+/*
+ * Test that the hash function does not overrun any buffers. Uses a guard page
+ * to catch buffer overruns even if they occur in assembly code.
+ */
+static void test_hash_buffer_overruns(struct kunit *test)
+{
+ const size_t max_tested_len = TEST_BUF_LEN - sizeof(struct HASH_CTX);
+ void *const buf_end = &test_buf[TEST_BUF_LEN];
+ struct HASH_CTX *guarded_ctx = buf_end - sizeof(*guarded_ctx);
+
+ rand_bytes(test_buf, TEST_BUF_LEN);
+
+ for (int i = 0; i < 100; i++) {
+ size_t len = rand_length(max_tested_len);
+ struct HASH_CTX ctx;
+ u8 hash[HASH_SIZE];
+
+ /* Check for overruns of the data buffer. */
+ HASH(buf_end - len, len, hash);
+ HASH_INIT(&ctx);
+ HASH_UPDATE(&ctx, buf_end - len, len);
+ HASH_FINAL(&ctx, hash);
+
+ /* Check for overruns of the hash value buffer. */
+ HASH(test_buf, len, buf_end - HASH_SIZE);
+ HASH_INIT(&ctx);
+ HASH_UPDATE(&ctx, test_buf, len);
+ HASH_FINAL(&ctx, buf_end - HASH_SIZE);
+
+		/* Check for overruns of the hash context. */
+ HASH_INIT(guarded_ctx);
+ HASH_UPDATE(guarded_ctx, test_buf, len);
+ HASH_FINAL(guarded_ctx, hash);
+ }
+}
+
+/*
+ * Test that the caller is permitted to alias the output digest and source data
+ * buffer, and also modify the source data buffer after it has been used.
+ */
+static void test_hash_overlaps(struct kunit *test)
+{
+ const size_t max_tested_len = TEST_BUF_LEN - HASH_SIZE;
+ struct HASH_CTX ctx;
+ u8 hash[HASH_SIZE];
+
+ rand_bytes(test_buf, TEST_BUF_LEN);
+
+ for (int i = 0; i < 100; i++) {
+ size_t len = rand_length(max_tested_len);
+ size_t offset = HASH_SIZE + rand_offset(max_tested_len - len);
+ bool left_end = rand_bool();
+ u8 *ovl_hash = left_end ? &test_buf[offset] :
+ &test_buf[offset + len - HASH_SIZE];
+
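+		/*
+		 * ovl_hash aliases either the first or last HASH_SIZE bytes of
+		 * the data. Starting offset at HASH_SIZE keeps the right-end
+		 * position inside test_buf even when len < HASH_SIZE.
+		 */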
+ HASH(&test_buf[offset], len, hash);
+ HASH(&test_buf[offset], len, ovl_hash);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, hash, ovl_hash, HASH_SIZE,
+ "Overlap test 1 failed with len=%zu offset=%zu left_end=%d",
+ len, offset, left_end);
+
+ /* Repeat the above test, but this time use init+update+final */
+ HASH(&test_buf[offset], len, hash);
+ HASH_INIT(&ctx);
+ HASH_UPDATE(&ctx, &test_buf[offset], len);
+ HASH_FINAL(&ctx, ovl_hash);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, hash, ovl_hash, HASH_SIZE,
+ "Overlap test 2 failed with len=%zu offset=%zu left_end=%d",
+ len, offset, left_end);
+
+ /* Test modifying the source data after it was used. */
+ HASH(&test_buf[offset], len, hash);
+ HASH_INIT(&ctx);
+ HASH_UPDATE(&ctx, &test_buf[offset], len);
+ rand_bytes(&test_buf[offset], len);
+ HASH_FINAL(&ctx, ovl_hash);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, hash, ovl_hash, HASH_SIZE,
+ "Overlap test 3 failed with len=%zu offset=%zu left_end=%d",
+ len, offset, left_end);
+ }
+}
+
+/*
+ * Test that if the same data is hashed at different alignments in memory, the
+ * results are the same.
+ */
+static void test_hash_alignment_consistency(struct kunit *test)
+{
+ u8 hash1[128 + HASH_SIZE];
+ u8 hash2[128 + HASH_SIZE];
+
+ for (int i = 0; i < 100; i++) {
+ size_t len = rand_length(TEST_BUF_LEN);
+ size_t data_offs1 = rand_offset(TEST_BUF_LEN - len);
+ size_t data_offs2 = rand_offset(TEST_BUF_LEN - len);
+ size_t hash_offs1 = rand_offset(128);
+ size_t hash_offs2 = rand_offset(128);
+
+ rand_bytes(&test_buf[data_offs1], len);
+ HASH(&test_buf[data_offs1], len, &hash1[hash_offs1]);
+ memmove(&test_buf[data_offs2], &test_buf[data_offs1], len);
+ HASH(&test_buf[data_offs2], len, &hash2[hash_offs2]);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, &hash1[hash_offs1], &hash2[hash_offs2], HASH_SIZE,
+ "Alignment consistency test failed with len=%zu data_offs=(%zu,%zu) hash_offs=(%zu,%zu)",
+ len, data_offs1, data_offs2, hash_offs1, hash_offs2);
+ }
+}
+
+/* Test that HASH_FINAL zeroizes the context. */
+static void test_hash_ctx_zeroization(struct kunit *test)
+{
+ static const u8 zeroes[sizeof(struct HASH_CTX)];
+ struct HASH_CTX ctx;
+
+ rand_bytes(test_buf, 128);
+ HASH_INIT(&ctx);
+ HASH_UPDATE(&ctx, test_buf, 128);
+ HASH_FINAL(&ctx, test_buf);
+ KUNIT_ASSERT_MEMEQ_MSG(test, &ctx, zeroes, sizeof(ctx),
+ "Hash context was not zeroized by finalization");
+}
+
+#define IRQ_TEST_DATA_LEN 256
+#define IRQ_TEST_NUM_BUFFERS 3 /* matches max concurrency level */
+
+struct hash_irq_test1_state {
+ u8 expected_hashes[IRQ_TEST_NUM_BUFFERS][HASH_SIZE];
+ atomic_t seqno;
+};
+
+/*
+ * Compute the hash of one of the test messages and verify that it matches the
+ * expected hash from @state->expected_hashes. To increase the chance of
+ * detecting problems, cycle through multiple messages.
+ */
+static bool hash_irq_test1_func(void *state_)
+{
+ struct hash_irq_test1_state *state = state_;
+ u32 i = (u32)atomic_inc_return(&state->seqno) % IRQ_TEST_NUM_BUFFERS;
+ u8 actual_hash[HASH_SIZE];
+
+ HASH(&test_buf[i * IRQ_TEST_DATA_LEN], IRQ_TEST_DATA_LEN, actual_hash);
+ return memcmp(actual_hash, state->expected_hashes[i], HASH_SIZE) == 0;
+}
+
+/*
+ * Test that if hashes are computed in task, softirq, and hardirq context
+ * concurrently, then all results are as expected.
+ */
+static void test_hash_interrupt_context_1(struct kunit *test)
+{
+ struct hash_irq_test1_state state = {};
+
+ /* Prepare some test messages and compute the expected hash of each. */
+ rand_bytes(test_buf, IRQ_TEST_NUM_BUFFERS * IRQ_TEST_DATA_LEN);
+ for (int i = 0; i < IRQ_TEST_NUM_BUFFERS; i++)
+ HASH(&test_buf[i * IRQ_TEST_DATA_LEN], IRQ_TEST_DATA_LEN,
+ state.expected_hashes[i]);
+
+ kunit_run_irq_test(test, hash_irq_test1_func, 100000, &state);
+}
+
+struct hash_irq_test2_hash_ctx {
+ struct HASH_CTX hash_ctx;
+ atomic_t in_use;
+ int offset;
+ int step;
+};
+
+struct hash_irq_test2_state {
+ struct hash_irq_test2_hash_ctx ctxs[IRQ_TEST_NUM_BUFFERS];
+ u8 expected_hash[HASH_SIZE];
+ u16 update_lens[32];
+ int num_steps;
+};
+
+static bool hash_irq_test2_func(void *state_)
+{
+ struct hash_irq_test2_state *state = state_;
+ struct hash_irq_test2_hash_ctx *ctx;
+ bool ret = true;
+
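+	/*
+	 * Claim a free context with an atomic 0 -> 1 compare-and-exchange;
+	 * atomic_set_release() below releases the claim and publishes the
+	 * context's updated state to the next user.
+	 */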
+ for (ctx = &state->ctxs[0]; ctx < &state->ctxs[ARRAY_SIZE(state->ctxs)];
+ ctx++) {
+ if (atomic_cmpxchg(&ctx->in_use, 0, 1) == 0)
+ break;
+ }
+ if (WARN_ON_ONCE(ctx == &state->ctxs[ARRAY_SIZE(state->ctxs)])) {
+ /*
+ * This should never happen, as the number of contexts is equal
+ * to the maximum concurrency level of kunit_run_irq_test().
+ */
+ return false;
+ }
+
+ if (ctx->step == 0) {
+ /* Init step */
+ HASH_INIT(&ctx->hash_ctx);
+ ctx->offset = 0;
+ ctx->step++;
+ } else if (ctx->step < state->num_steps - 1) {
+ /* Update step */
+ HASH_UPDATE(&ctx->hash_ctx, &test_buf[ctx->offset],
+ state->update_lens[ctx->step - 1]);
+ ctx->offset += state->update_lens[ctx->step - 1];
+ ctx->step++;
+ } else {
+ /* Final step */
+ u8 actual_hash[HASH_SIZE];
+
+ if (WARN_ON_ONCE(ctx->offset != TEST_BUF_LEN))
+ ret = false;
+ HASH_FINAL(&ctx->hash_ctx, actual_hash);
+ if (memcmp(actual_hash, state->expected_hash, HASH_SIZE) != 0)
+ ret = false;
+ ctx->step = 0;
+ }
+ atomic_set_release(&ctx->in_use, 0);
+ return ret;
+}
+
+/*
+ * Test that if hashes are computed in task, softirq, and hardirq context
+ * concurrently, *including doing different parts of the same incremental
+ * computation in different contexts*, then all results are as expected.
+ * Besides detecting bugs similar to those that test_hash_interrupt_context_1
+ * can detect, this test case can also detect bugs where hash function
+ * implementations don't correctly handle these mixed incremental computations.
+ */
+static void test_hash_interrupt_context_2(struct kunit *test)
+{
+ struct hash_irq_test2_state *state;
+ int remaining = TEST_BUF_LEN;
+
+ state = kunit_kzalloc(test, sizeof(*state), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, state);
+
+ rand_bytes(test_buf, TEST_BUF_LEN);
+ HASH(test_buf, TEST_BUF_LEN, state->expected_hash);
+
+ /*
+ * Generate a list of update lengths to use. Ensure that it contains
+ * multiple entries but is limited to a maximum length.
+ */
+ static_assert(TEST_BUF_LEN / 4096 > 1);
+ for (state->num_steps = 0;
+ state->num_steps < ARRAY_SIZE(state->update_lens) - 1 && remaining;
+ state->num_steps++) {
+ state->update_lens[state->num_steps] =
+ rand_length(min(remaining, 4096));
+ remaining -= state->update_lens[state->num_steps];
+ }
+ if (remaining)
+ state->update_lens[state->num_steps++] = remaining;
+ state->num_steps += 2; /* for init and final */
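+	/*
+	 * hash_irq_test2_func() interprets these as: step 0 is the init,
+	 * steps 1 to num_steps - 2 are the updates, and step num_steps - 1 is
+	 * the final.
+	 */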
+
+ kunit_run_irq_test(test, hash_irq_test2_func, 250000, state);
+}
+
+#define UNKEYED_HASH_KUNIT_CASES \
+ KUNIT_CASE(test_hash_test_vectors), \
+ KUNIT_CASE(test_hash_all_lens_up_to_4096), \
+ KUNIT_CASE(test_hash_incremental_updates), \
+ KUNIT_CASE(test_hash_buffer_overruns), \
+ KUNIT_CASE(test_hash_overlaps), \
+ KUNIT_CASE(test_hash_alignment_consistency), \
+ KUNIT_CASE(test_hash_ctx_zeroization), \
+ KUNIT_CASE(test_hash_interrupt_context_1), \
+ KUNIT_CASE(test_hash_interrupt_context_2)
+/* benchmark_hash is omitted so that the suites can put it last. */
+
+#ifdef HMAC
+/*
+ * Test the corresponding HMAC variant.
+ *
+ * This test case is fairly short, since HMAC is just a simple C wrapper around
+ * the underlying unkeyed hash function, which is already well-tested by the
+ * other test cases. It's not useful to test things like data alignment or
+ * interrupt context again for HMAC, nor to have a long list of test vectors.
+ *
+ * Thus, just do a single consolidated test, which covers all data lengths up to
+ * 4096 bytes and all key lengths up to 292 bytes. For each data length, select
+ * a key length, generate the inputs from a seed, and compute the HMAC value.
+ * Concatenate all these HMAC values together, and compute the HMAC of that.
+ * Verify that value. If this fails, then the HMAC implementation is wrong.
+ * This won't show which specific input failed, but that should be fine. Any
+ * failure would likely be non-input-specific or also show in the unkeyed tests.
+ */
+static void test_hmac(struct kunit *test)
+{
+ static const u8 zeroes[sizeof(struct HMAC_CTX)];
+ u8 *raw_key;
+ struct HMAC_KEY key;
+ struct HMAC_CTX ctx;
+ u8 mac[HASH_SIZE];
+ u8 mac2[HASH_SIZE];
+
+ static_assert(TEST_BUF_LEN >= 4096 + 293);
+ rand_bytes_seeded_from_len(test_buf, 4096);
+ raw_key = &test_buf[4096];
+
+ rand_bytes_seeded_from_len(raw_key, 32);
+ HMAC_PREPAREKEY(&key, raw_key, 32);
+ HMAC_INIT(&ctx, &key);
+ for (size_t data_len = 0; data_len <= 4096; data_len++) {
+ /*
+		 * Cycle through key lengths as well. Somewhat arbitrarily go
+		 * up to 293, which is slightly larger than the largest hash
+ * block size (which is the size at which the key starts being
+ * hashed down to one block); going higher would not be useful.
+ * To reduce correlation with data_len, use a prime number here.
+ */
+ size_t key_len = data_len % 293;
+
+ HMAC_UPDATE(&ctx, test_buf, data_len);
+
+ rand_bytes_seeded_from_len(raw_key, key_len);
+ HMAC_USINGRAWKEY(raw_key, key_len, test_buf, data_len, mac);
+ HMAC_UPDATE(&ctx, mac, HASH_SIZE);
+
+ /* Verify that HMAC() is consistent with HMAC_USINGRAWKEY(). */
+ HMAC_PREPAREKEY(&key, raw_key, key_len);
+ HMAC(&key, test_buf, data_len, mac2);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, mac, mac2, HASH_SIZE,
+ "HMAC gave different results with raw and prepared keys");
+ }
+ HMAC_FINAL(&ctx, mac);
+ KUNIT_EXPECT_MEMEQ_MSG(test, mac, hmac_testvec_consolidated, HASH_SIZE,
+ "HMAC gave wrong result");
+ KUNIT_EXPECT_MEMEQ_MSG(test, &ctx, zeroes, sizeof(ctx),
+ "HMAC context was not zeroized by finalization");
+}
+#define HASH_KUNIT_CASES UNKEYED_HASH_KUNIT_CASES, KUNIT_CASE(test_hmac)
+#else
+#define HASH_KUNIT_CASES UNKEYED_HASH_KUNIT_CASES
+#endif
+
+/* Benchmark the hash function on various data lengths. */
+static void benchmark_hash(struct kunit *test)
+{
+ static const size_t lens_to_test[] = {
+ 1, 16, 64, 127, 128, 200, 256,
+ 511, 512, 1024, 3173, 4096, 16384,
+ };
+ u8 hash[HASH_SIZE];
+
+ if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK))
+ kunit_skip(test, "not enabled");
+
+ /* Warm-up */
+ for (size_t i = 0; i < 10000000; i += TEST_BUF_LEN)
+ HASH(test_buf, TEST_BUF_LEN, hash);
+
+ for (size_t i = 0; i < ARRAY_SIZE(lens_to_test); i++) {
+ size_t len = lens_to_test[i];
+ /* The '+ 128' tries to account for per-message overhead. */
+ size_t num_iters = 10000000 / (len + 128);
+ u64 t;
+
+ KUNIT_ASSERT_LE(test, len, TEST_BUF_LEN);
+ preempt_disable();
+ t = ktime_get_ns();
+ for (size_t j = 0; j < num_iters; j++)
+ HASH(test_buf, len, hash);
+ t = ktime_get_ns() - t;
+ preempt_enable();
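+		/*
+		 * (len * num_iters) bytes were hashed in t nanoseconds. Since
+		 * bytes/ns equals 10**9 bytes/s, multiplying by 1000 converts
+		 * the ratio to MB/s (10**6 bytes/s).
+		 */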
+ kunit_info(test, "len=%zu: %llu MB/s", len,
+ div64_u64((u64)len * num_iters * 1000, t ?: 1));
+ }
+}
diff --git a/lib/crypto/tests/md5-testvecs.h b/lib/crypto/tests/md5-testvecs.h
new file mode 100644
index 000000000000..be6727feb296
--- /dev/null
+++ b/lib/crypto/tests/md5-testvecs.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py md5 */
+
+static const struct {
+ size_t data_len;
+ u8 digest[MD5_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0xd4, 0x1d, 0x8c, 0xd9, 0x8f, 0x00, 0xb2, 0x04,
+ 0xe9, 0x80, 0x09, 0x98, 0xec, 0xf8, 0x42, 0x7e,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0x16, 0x7b, 0x86, 0xf2, 0x1d, 0xf3, 0x76, 0xc9,
+ 0x6f, 0x10, 0xa0, 0x61, 0x5b, 0x14, 0x20, 0x0b,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0x2d, 0x30, 0x96, 0xc7, 0x43, 0x40, 0xed, 0xb2,
+ 0xfb, 0x84, 0x63, 0x9a, 0xec, 0xc7, 0x3c, 0x3c,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0xe5, 0x0f, 0xce, 0xe0, 0xc8, 0xff, 0x4e, 0x08,
+ 0x5e, 0x19, 0xe5, 0xf2, 0x08, 0x11, 0x19, 0x16,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0xe8, 0xca, 0x29, 0x05, 0x2f, 0xd1, 0xf3, 0x99,
+ 0x40, 0x71, 0xf5, 0xc2, 0xf7, 0xf8, 0x17, 0x3e,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0xe3, 0x20, 0xc1, 0xd8, 0x21, 0x14, 0x44, 0x59,
+ 0x1a, 0xf5, 0x91, 0xaf, 0x69, 0xbe, 0x93, 0x9d,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0xfb, 0x06, 0xb0, 0xf0, 0x00, 0x10, 0x4b, 0x68,
+ 0x3d, 0x75, 0xf9, 0x70, 0xde, 0xbb, 0x32, 0x16,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x52, 0x86, 0x48, 0x8b, 0xae, 0x91, 0x7c, 0x4e,
+ 0xc2, 0x2a, 0x69, 0x07, 0x35, 0xcc, 0xb2, 0x88,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0xfa, 0xd3, 0xf6, 0xe6, 0x7b, 0x1a, 0xc6, 0x05,
+ 0x73, 0x35, 0x02, 0xab, 0xc7, 0xb3, 0x47, 0xcb,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0xc5, 0x59, 0x29, 0xe9, 0x0a, 0x4a, 0x86, 0x43,
+ 0x7c, 0xaf, 0xdf, 0x83, 0xd3, 0xb8, 0x33, 0x5f,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0x80, 0x05, 0x75, 0x39, 0xec, 0x44, 0x8a, 0x81,
+ 0xe7, 0x6e, 0x8d, 0xd1, 0xc6, 0xeb, 0xc2, 0xf0,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0x3f, 0x02, 0xe8, 0xc6, 0xb8, 0x6a, 0x39, 0xc3,
+ 0xa4, 0x1c, 0xd9, 0x8f, 0x4a, 0x71, 0x40, 0x30,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x89, 0x4f, 0x79, 0x3e, 0xff, 0x0c, 0x22, 0x60,
+ 0xa2, 0xdc, 0x10, 0x5f, 0x23, 0x0a, 0xe7, 0xc6,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x06, 0x56, 0x61, 0xb8, 0x8a, 0x82, 0x77, 0x1b,
+ 0x2c, 0x35, 0xb8, 0x9f, 0xd6, 0xf7, 0xbd, 0x5a,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x5d, 0xdf, 0x7d, 0xc8, 0x43, 0x96, 0x3b, 0xdb,
+ 0xc7, 0x0e, 0x44, 0x42, 0x23, 0xf7, 0xed, 0xdf,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0xf6, 0x5f, 0x26, 0x51, 0x8a, 0x5a, 0x46, 0x8f,
+ 0x48, 0x72, 0x90, 0x74, 0x9d, 0x87, 0xbd, 0xdf,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0xd8, 0x2c, 0xc9, 0x76, 0xfa, 0x67, 0x2e, 0xa6,
+ 0xc8, 0x12, 0x4a, 0x64, 0xaa, 0x0b, 0x3d, 0xbd,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0xe2, 0x7e, 0xb4, 0x5f, 0xe1, 0x74, 0x51, 0xfc,
+ 0xe0, 0xc8, 0xd5, 0xe6, 0x8b, 0x40, 0xd2, 0x0e,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0xcd, 0x7d, 0x56, 0xa9, 0x4c, 0x47, 0xea, 0xc2,
+ 0x34, 0x0b, 0x84, 0x05, 0xf9, 0xad, 0xbb, 0x46,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0x63, 0x6e, 0x58, 0xb3, 0x94, 0x6b, 0x83, 0x5f,
+ 0x1f, 0x0e, 0xd3, 0x66, 0x78, 0x71, 0x98, 0x42,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0x9d, 0x68, 0xfc, 0x26, 0x8b, 0x4c, 0xa8, 0xe7,
+ 0x30, 0x0b, 0x19, 0x52, 0x6e, 0xa5, 0x65, 0x1c,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0x1c, 0xaa, 0x7d, 0xee, 0x91, 0x01, 0xe2, 0x5a,
+ 0xec, 0xe9, 0xde, 0x57, 0x0a, 0xb6, 0x4c, 0x2f,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0x1b, 0x31, 0xe3, 0x14, 0x07, 0x16, 0x17, 0xc6,
+ 0x98, 0x79, 0x88, 0x23, 0xb6, 0x3b, 0x25, 0xc4,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0xc6, 0x3d, 0x56, 0x90, 0xf0, 0xf6, 0xe6, 0x50,
+ 0xf4, 0x76, 0x78, 0x67, 0xa3, 0xdd, 0x62, 0x7b,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[MD5_DIGEST_SIZE] = {
+ 0x70, 0x86, 0x9e, 0x6c, 0xa4, 0xc6, 0x71, 0x43,
+ 0x26, 0x02, 0x1b, 0x3f, 0xfd, 0x56, 0x9f, 0xa6,
+};
+
+static const u8 hmac_testvec_consolidated[MD5_DIGEST_SIZE] = {
+ 0x10, 0x02, 0x74, 0xf6, 0x4d, 0xb3, 0x3c, 0xc7,
+ 0xa1, 0xf7, 0xe6, 0xd4, 0x32, 0x64, 0xfa, 0x6d,
+};
diff --git a/lib/crypto/tests/md5_kunit.c b/lib/crypto/tests/md5_kunit.c
new file mode 100644
index 000000000000..38bd52c25ae3
--- /dev/null
+++ b/lib/crypto/tests/md5_kunit.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/md5.h>
+#include "md5-testvecs.h"
+
+#define HASH md5
+#define HASH_CTX md5_ctx
+#define HASH_SIZE MD5_DIGEST_SIZE
+#define HASH_INIT md5_init
+#define HASH_UPDATE md5_update
+#define HASH_FINAL md5_final
+#define HMAC_KEY hmac_md5_key
+#define HMAC_CTX hmac_md5_ctx
+#define HMAC_PREPAREKEY hmac_md5_preparekey
+#define HMAC_INIT hmac_md5_init
+#define HMAC_UPDATE hmac_md5_update
+#define HMAC_FINAL hmac_md5_final
+#define HMAC hmac_md5
+#define HMAC_USINGRAWKEY hmac_md5_usingrawkey
+#include "hash-test-template.h"
+
+static struct kunit_case hash_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite hash_test_suite = {
+ .name = "md5",
+ .test_cases = hash_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(hash_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for MD5 and HMAC-MD5");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/poly1305-testvecs.h b/lib/crypto/tests/poly1305-testvecs.h
new file mode 100644
index 000000000000..ecf8662225c8
--- /dev/null
+++ b/lib/crypto/tests/poly1305-testvecs.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py poly1305 */
+
+static const struct {
+ size_t data_len;
+ u8 digest[POLY1305_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0xe8, 0x2d, 0x67, 0x2c, 0x01, 0x48, 0xf9, 0xb7,
+ 0x87, 0x85, 0x3f, 0xcf, 0x18, 0x66, 0x8c, 0xd3,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0xb8, 0xad, 0xca, 0x6b, 0x32, 0xba, 0x34, 0x42,
+ 0x54, 0x10, 0x28, 0xf5, 0x0f, 0x7e, 0x8e, 0xe3,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0xb8, 0xf7, 0xf4, 0xc2, 0x85, 0x33, 0x80, 0x63,
+ 0xd1, 0x45, 0xda, 0xf8, 0x7c, 0x79, 0x42, 0xd1,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0xf3, 0x73, 0x7b, 0x60, 0x24, 0xcc, 0x5d, 0x3e,
+ 0xd1, 0x95, 0x86, 0xce, 0x89, 0x0a, 0x33, 0xba,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0x0a, 0x1a, 0x2d, 0x39, 0xea, 0x49, 0x8f, 0xb7,
+ 0x90, 0xb6, 0x74, 0x3b, 0x41, 0x3b, 0x96, 0x11,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0x99, 0x05, 0xe3, 0xa7, 0x9e, 0x2a, 0xd2, 0x42,
+ 0xb9, 0x45, 0x0c, 0x08, 0xe7, 0x10, 0xe4, 0xe1,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0xe1, 0xb2, 0x15, 0xee, 0xa2, 0xf3, 0x04, 0xac,
+ 0xdd, 0x27, 0x57, 0x95, 0x2f, 0x45, 0xa8, 0xd3,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x1c, 0xf3, 0xab, 0x39, 0xc0, 0x69, 0x49, 0x69,
+ 0x89, 0x6f, 0x1f, 0x03, 0x16, 0xe7, 0xc0, 0xf0,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0x30, 0xb0, 0x32, 0x87, 0x51, 0x55, 0x9c, 0x39,
+ 0x38, 0x42, 0x06, 0xe9, 0x2a, 0x3e, 0x2c, 0x92,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x2c, 0x04, 0x16, 0x36, 0x55, 0x25, 0x2d, 0xc6,
+ 0x3d, 0x70, 0x5b, 0x88, 0x46, 0xb6, 0x71, 0x77,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0x03, 0x87, 0xdd, 0xbe, 0xe8, 0x30, 0xf2, 0x15,
+ 0x40, 0x44, 0x29, 0x7b, 0xb1, 0xe9, 0x9d, 0xe7,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0x29, 0x83, 0x4f, 0xcb, 0x5a, 0x93, 0x25, 0xad,
+ 0x05, 0xa4, 0xb3, 0x24, 0x77, 0x62, 0x2d, 0x3d,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x20, 0x0e, 0x2c, 0x05, 0xe2, 0x0b, 0x85, 0xa0,
+ 0x24, 0x73, 0x7f, 0x65, 0x70, 0x6c, 0x3e, 0xb0,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0xef, 0x2f, 0x98, 0x42, 0xc2, 0x90, 0x55, 0xea,
+ 0xba, 0x28, 0x76, 0xfd, 0x9e, 0x3e, 0x4d, 0x53,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x9e, 0x75, 0x4b, 0xc7, 0x69, 0x68, 0x51, 0x90,
+ 0xdc, 0x29, 0xc8, 0xfa, 0x86, 0xf1, 0xc9, 0xb3,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0x9d, 0x13, 0xf5, 0x54, 0xe6, 0xe3, 0x45, 0x38,
+ 0x8b, 0x6d, 0x5c, 0xc4, 0x50, 0xeb, 0x90, 0xcb,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0xaa, 0xb2, 0x3e, 0x3c, 0x2a, 0xfc, 0x62, 0x0e,
+ 0xd4, 0xe6, 0xe5, 0x5c, 0x6b, 0x9f, 0x3d, 0xc7,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0xd6, 0x8c, 0xea, 0x8a, 0x0f, 0x68, 0xa9, 0xa8,
+ 0x67, 0x86, 0xf9, 0xc1, 0x4c, 0x26, 0x10, 0x6d,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0xdc, 0xc1, 0x54, 0xe7, 0x38, 0xc6, 0xdb, 0x24,
+ 0xa7, 0x0b, 0xff, 0xd3, 0x1b, 0x93, 0x01, 0xa6,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0x8f, 0x88, 0x3e, 0x9c, 0x7b, 0x2e, 0x82, 0x5a,
+ 0x1d, 0x31, 0x82, 0xcc, 0x69, 0xb4, 0x16, 0x26,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0x23, 0x45, 0x94, 0xa8, 0x11, 0x54, 0x9d, 0xf2,
+ 0xa1, 0x9a, 0xca, 0xf9, 0x3e, 0x65, 0x52, 0xfd,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0x7b, 0xfc, 0xa9, 0x1e, 0x03, 0xad, 0xef, 0x03,
+ 0xe2, 0x20, 0x92, 0xc7, 0x54, 0x83, 0xfa, 0x37,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0x46, 0xab, 0x8c, 0x75, 0xb3, 0x10, 0xa6, 0x3f,
+ 0x74, 0x55, 0x42, 0x6d, 0x69, 0x35, 0xd2, 0xf5,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0xd0, 0xfe, 0x26, 0xc2, 0xca, 0x94, 0x2d, 0x52,
+ 0x2d, 0xe1, 0x11, 0xdd, 0x42, 0x28, 0x83, 0xa6,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[POLY1305_DIGEST_SIZE] = {
+ 0x9d, 0x07, 0x5d, 0xc9, 0x6c, 0xeb, 0x62, 0x5d,
+ 0x02, 0x5f, 0xe1, 0xe3, 0xd1, 0x71, 0x69, 0x34,
+};
+
+static const u8 poly1305_allones_macofmacs[POLY1305_DIGEST_SIZE] = {
+ 0x0c, 0x26, 0x6b, 0x45, 0x87, 0x06, 0xcf, 0xc4,
+ 0x3f, 0x70, 0x7d, 0xb3, 0x50, 0xdd, 0x81, 0x25,
+};
diff --git a/lib/crypto/tests/poly1305_kunit.c b/lib/crypto/tests/poly1305_kunit.c
new file mode 100644
index 000000000000..7ac191bd96b6
--- /dev/null
+++ b/lib/crypto/tests/poly1305_kunit.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/poly1305.h>
+#include "poly1305-testvecs.h"
+
+/*
+ * A fixed key used when presenting Poly1305 as an unkeyed hash function in
+ * order to reuse hash-test-template.h. At the beginning of the test suite,
+ * this is initialized to bytes generated from a fixed seed.
+ */
+static u8 test_key[POLY1305_KEY_SIZE];
+
+/* This probably should be in the actual API, but just define it here for now */
+static void poly1305(const u8 key[POLY1305_KEY_SIZE], const u8 *data,
+ size_t len, u8 out[POLY1305_DIGEST_SIZE])
+{
+ struct poly1305_desc_ctx ctx;
+
+ poly1305_init(&ctx, key);
+ poly1305_update(&ctx, data, len);
+ poly1305_final(&ctx, out);
+}
+
+static void poly1305_init_withtestkey(struct poly1305_desc_ctx *ctx)
+{
+ poly1305_init(ctx, test_key);
+}
+
+static void poly1305_withtestkey(const u8 *data, size_t len,
+ u8 out[POLY1305_DIGEST_SIZE])
+{
+ poly1305(test_key, data, len, out);
+}
+
+/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */
+#define HASH poly1305_withtestkey
+#define HASH_CTX poly1305_desc_ctx
+#define HASH_SIZE POLY1305_DIGEST_SIZE
+#define HASH_INIT poly1305_init_withtestkey
+#define HASH_UPDATE poly1305_update
+#define HASH_FINAL poly1305_final
+#include "hash-test-template.h"
+
+static int poly1305_suite_init(struct kunit_suite *suite)
+{
+ rand_bytes_seeded_from_len(test_key, POLY1305_KEY_SIZE);
+ return hash_suite_init(suite);
+}
+
+static void poly1305_suite_exit(struct kunit_suite *suite)
+{
+ hash_suite_exit(suite);
+}
+
+/*
+ * Poly1305 test case which uses a key and message consisting only of one bits:
+ *
+ * - Using an all-one-bits r_key tests the key clamping.
+ * - Using an all-one-bits s_key tests carries in implementations of the
+ * addition mod 2**128 during finalization.
+ * - Using an all-one-bits message, and to a lesser extent r_key, tends to
+ *   maximize any intermediate accumulator values. This increases the chance of
+ * detecting bugs that occur only in rare cases where the accumulator values
+ * get very large, for example the bug fixed by commit 678cce4019d746da
+ * ("crypto: x86/poly1305 - fix overflow during partial reduction").
+ *
+ * Accumulator overflow bugs may be specific to particular update lengths (in
+ * blocks) and/or particular values of the previous accumulator. Note that the
+ * accumulator starts at 0 which gives the lowest chance of an overflow. Thus,
+ * a single all-one-bits test vector may be insufficient.
+ *
+ * Considering that, do the following test: continuously update a single
+ * Poly1305 context with all-one-bits data of varying lengths (0, 16, 32, ...,
+ * 4096 bytes). After each update, generate the MAC from the current context,
+ * and feed that MAC into a separate Poly1305 context. Repeat that entire
+ * sequence of updates 32 times without re-initializing either context,
+ * resulting in a total of 8224 MAC computations from a long-running, cumulative
+ * context. Finally, generate and verify the MAC of all the MACs.
+ */
+static void test_poly1305_allones_keys_and_message(struct kunit *test)
+{
+ struct poly1305_desc_ctx mac_ctx, macofmacs_ctx;
+ u8 mac[POLY1305_DIGEST_SIZE];
+
+ static_assert(TEST_BUF_LEN >= 4096);
+ memset(test_buf, 0xff, 4096);
+
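+	/*
+	 * Poly1305 clamps the all-one-bits r_key down to
+	 * 0x0ffffffc0ffffffc0ffffffc0fffffff, so a missing or incorrect clamp
+	 * shows up immediately as a wrong MAC here.
+	 */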
+ poly1305_init(&mac_ctx, test_buf);
+ poly1305_init(&macofmacs_ctx, test_buf);
+ for (int i = 0; i < 32; i++) {
+ for (size_t len = 0; len <= 4096; len += 16) {
+ struct poly1305_desc_ctx tmp_ctx;
+
+ poly1305_update(&mac_ctx, test_buf, len);
+ tmp_ctx = mac_ctx;
+ poly1305_final(&tmp_ctx, mac);
+ poly1305_update(&macofmacs_ctx, mac,
+ POLY1305_DIGEST_SIZE);
+ }
+ }
+ poly1305_final(&macofmacs_ctx, mac);
+ KUNIT_ASSERT_MEMEQ(test, mac, poly1305_allones_macofmacs,
+ POLY1305_DIGEST_SIZE);
+}
+
+/*
+ * Poly1305 test case which uses r_key=1, s_key=0, and a 48-byte message
+ * consisting of three blocks with integer values [2**128 - i, 0, 0]. In this
+ * case, the result of the polynomial evaluation is 2**130 - i. For small
+ * values of i, this is very close to the modulus 2**130 - 5, which helps catch
+ * edge case bugs in the modular reduction logic.
+ */
+static void test_poly1305_reduction_edge_cases(struct kunit *test)
+{
+ static const u8 key[POLY1305_KEY_SIZE] = { 1 }; /* r_key=1, s_key=0 */
+ u8 data[3 * POLY1305_BLOCK_SIZE] = {};
+ u8 expected_mac[POLY1305_DIGEST_SIZE];
+ u8 actual_mac[POLY1305_DIGEST_SIZE];
+
+ for (int i = 1; i <= 10; i++) {
+ /* Set the first data block to 2**128 - i. */
+ data[0] = -i;
+ memset(&data[1], 0xff, POLY1305_BLOCK_SIZE - 1);
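+
+		/*
+		 * Poly1305 pads each 16-byte block by adding 2**128 (the
+		 * appended 1 bit). With r=1 the polynomial is just the sum of
+		 * the padded blocks: (2**129 - i) + 2**128 + 2**128 =
+		 * 2**130 - i.
+		 */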
+
+ /*
+ * Assuming s_key=0, the expected MAC as an integer is
+		 * (((2**130 - i) mod (2**130 - 5)) + 0) mod 2**128. If 1 <= i <= 5,
+ * that's 5 - i. If 6 <= i <= 10, that's 2**128 - i.
+ */
+ if (i <= 5) {
+ expected_mac[0] = 5 - i;
+ memset(&expected_mac[1], 0, POLY1305_DIGEST_SIZE - 1);
+ } else {
+ expected_mac[0] = -i;
+ memset(&expected_mac[1], 0xff,
+ POLY1305_DIGEST_SIZE - 1);
+ }
+
+ /* Compute and verify the MAC. */
+ poly1305(key, data, sizeof(data), actual_mac);
+ KUNIT_ASSERT_MEMEQ(test, actual_mac, expected_mac,
+ POLY1305_DIGEST_SIZE);
+ }
+}
+
+static struct kunit_case poly1305_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(test_poly1305_allones_keys_and_message),
+ KUNIT_CASE(test_poly1305_reduction_edge_cases),
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite poly1305_test_suite = {
+ .name = "poly1305",
+ .test_cases = poly1305_test_cases,
+ .suite_init = poly1305_suite_init,
+ .suite_exit = poly1305_suite_exit,
+};
+kunit_test_suite(poly1305_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for Poly1305");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/polyval-testvecs.h b/lib/crypto/tests/polyval-testvecs.h
new file mode 100644
index 000000000000..3d33f60d58bb
--- /dev/null
+++ b/lib/crypto/tests/polyval-testvecs.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py polyval */
+
+static const struct {
+ size_t data_len;
+ u8 digest[POLYVAL_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0xb5, 0x51, 0x69, 0x89, 0xd4, 0x3c, 0x59, 0xca,
+ 0x6a, 0x1c, 0x2a, 0xe9, 0xa1, 0x9c, 0x6c, 0x83,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0xf4, 0x50, 0xaf, 0x07, 0xda, 0x42, 0xa7, 0x41,
+ 0x4d, 0x24, 0x88, 0x87, 0xe3, 0x40, 0x73, 0x7c,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0x9e, 0x88, 0x78, 0x71, 0x4c, 0x55, 0x87, 0xe8,
+ 0xb4, 0x96, 0x3d, 0x56, 0xc8, 0xb2, 0xe1, 0x68,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0x9e, 0x81, 0x37, 0x8f, 0x49, 0xf7, 0xa2, 0xe4,
+ 0x04, 0x45, 0x12, 0x78, 0x45, 0x42, 0x27, 0xad,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0x60, 0x19, 0xd0, 0xa4, 0xf0, 0xde, 0x9e, 0xe7,
+ 0x6a, 0x89, 0x1a, 0xea, 0x80, 0x14, 0xa9, 0xa3,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0x0c, 0xa2, 0x70, 0x4d, 0x7c, 0x89, 0xac, 0x41,
+ 0xc2, 0x9e, 0x0d, 0x07, 0x07, 0x6a, 0x7f, 0xd5,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x91, 0xd3, 0xa9, 0x5c, 0x79, 0x3d, 0x6b, 0x84,
+ 0x99, 0x54, 0xa7, 0xb4, 0x06, 0x66, 0xfd, 0x1c,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0x29, 0x37, 0xb8, 0xe5, 0xd8, 0x27, 0x4d, 0xfb,
+ 0x83, 0x4f, 0x67, 0xf7, 0xf9, 0xc1, 0x0a, 0x9d,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x17, 0xa9, 0x06, 0x2c, 0xf3, 0xe8, 0x2e, 0xa6,
+ 0x6b, 0xb2, 0x1f, 0x5d, 0x94, 0x3c, 0x02, 0xa2,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0x7c, 0x80, 0x74, 0xd7, 0xa1, 0x37, 0x30, 0x64,
+ 0x3b, 0xa4, 0xa3, 0x98, 0xde, 0x47, 0x10, 0x23,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0x27, 0x3a, 0xcf, 0xf5, 0xaf, 0x9f, 0xd8, 0xd8,
+ 0x2d, 0x6a, 0x91, 0xfb, 0xb8, 0xfa, 0xbe, 0x0c,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x97, 0x6e, 0xc4, 0xbe, 0x6b, 0x15, 0xa6, 0x7c,
+ 0xc4, 0xa2, 0xb8, 0x0a, 0x0e, 0x9c, 0xc7, 0x3a,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x2b, 0xc3, 0x98, 0xba, 0x6e, 0x42, 0xf8, 0x18,
+ 0x85, 0x69, 0x15, 0x37, 0x10, 0x60, 0xe6, 0xac,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x88, 0x21, 0x77, 0x89, 0xd7, 0x93, 0x90, 0xfc,
+ 0xf3, 0xb0, 0xe3, 0xfb, 0x14, 0xe2, 0xcf, 0x74,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0x66, 0x3d, 0x3e, 0x08, 0xa0, 0x49, 0x81, 0x68,
+ 0x3e, 0x3b, 0xc8, 0x80, 0x55, 0xd4, 0x15, 0xe9,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0x05, 0xf5, 0x06, 0x66, 0xe7, 0x11, 0x08, 0x84,
+ 0xff, 0x94, 0x50, 0x85, 0x65, 0x95, 0x2a, 0x20,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0xd3, 0xa0, 0x51, 0x69, 0xb5, 0x38, 0xae, 0x1b,
+ 0xe1, 0xa2, 0x89, 0xc6, 0x8d, 0x2b, 0x62, 0x37,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0x37, 0x6d, 0x6a, 0x14, 0xdc, 0xa5, 0x37, 0xfc,
+ 0xfe, 0x67, 0x76, 0xb2, 0x64, 0x68, 0x64, 0x05,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0xe3, 0x12, 0x0c, 0x58, 0x46, 0x45, 0x27, 0x7a,
+ 0x0e, 0xa2, 0xfa, 0x2c, 0x35, 0x73, 0x6c, 0x94,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0x63, 0x0d, 0xa1, 0xbc, 0x6e, 0x3e, 0xd3, 0x1d,
+ 0x28, 0x52, 0xd2, 0xf4, 0x30, 0x2d, 0xff, 0xc4,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0xb2, 0x91, 0x49, 0xe2, 0x02, 0x98, 0x00, 0x79,
+ 0x71, 0xb9, 0xd7, 0xd4, 0xb5, 0x94, 0x6d, 0x7d,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0x58, 0x96, 0x48, 0x69, 0x05, 0x17, 0xe1, 0x6d,
+ 0xbc, 0xf2, 0x3d, 0x10, 0x96, 0x00, 0x74, 0x58,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0x99, 0x3c, 0xcb, 0x4d, 0x64, 0xc9, 0xa9, 0x41,
+ 0x52, 0x93, 0xfd, 0x65, 0xc4, 0xcc, 0xa5, 0xe5,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[POLYVAL_DIGEST_SIZE] = {
+ 0xdf, 0x68, 0x52, 0x99, 0x92, 0xc3, 0xe8, 0x88,
+ 0x29, 0x13, 0xc8, 0x35, 0x67, 0xa3, 0xd3, 0xad,
+};
+
+static const u8 polyval_allones_hashofhashes[POLYVAL_DIGEST_SIZE] = {
+ 0xd5, 0xf7, 0xfd, 0xb2, 0xa6, 0xef, 0x0b, 0x85,
+ 0x0d, 0x0a, 0x06, 0x10, 0xbc, 0x64, 0x94, 0x73,
+};
diff --git a/lib/crypto/tests/polyval_kunit.c b/lib/crypto/tests/polyval_kunit.c
new file mode 100644
index 000000000000..e59f598c1572
--- /dev/null
+++ b/lib/crypto/tests/polyval_kunit.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/polyval.h>
+#include "polyval-testvecs.h"
+
+/*
+ * A fixed key used when presenting POLYVAL as an unkeyed hash function in order
+ * to reuse hash-test-template.h. At the beginning of the test suite, this is
+ * initialized to a key prepared from bytes generated from a fixed seed.
+ */
+static struct polyval_key test_key;
+
+static void polyval_init_withtestkey(struct polyval_ctx *ctx)
+{
+ polyval_init(ctx, &test_key);
+}
+
+static void polyval_withtestkey(const u8 *data, size_t len,
+ u8 out[POLYVAL_BLOCK_SIZE])
+{
+ polyval(&test_key, data, len, out);
+}
+
+/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */
+#define HASH polyval_withtestkey
+#define HASH_CTX polyval_ctx
+#define HASH_SIZE POLYVAL_BLOCK_SIZE
+#define HASH_INIT polyval_init_withtestkey
+#define HASH_UPDATE polyval_update
+#define HASH_FINAL polyval_final
+#include "hash-test-template.h"
+
+/*
+ * Test an example from RFC8452 ("AES-GCM-SIV: Nonce Misuse-Resistant
+ * Authenticated Encryption") to ensure compatibility with that.
+ */
+static void test_polyval_rfc8452_testvec(struct kunit *test)
+{
+ static const u8 raw_key[POLYVAL_BLOCK_SIZE] =
+ "\x31\x07\x28\xd9\x91\x1f\x1f\x38"
+ "\x37\xb2\x43\x16\xc3\xfa\xb9\xa0";
+ static const u8 data[48] =
+ "\x65\x78\x61\x6d\x70\x6c\x65\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x48\x65\x6c\x6c\x6f\x20\x77\x6f"
+ "\x72\x6c\x64\x00\x00\x00\x00\x00"
+ "\x38\x00\x00\x00\x00\x00\x00\x00"
+ "\x58\x00\x00\x00\x00\x00\x00\x00";
+ static const u8 expected_hash[POLYVAL_BLOCK_SIZE] =
+ "\xad\x7f\xcf\x0b\x51\x69\x85\x16"
+ "\x62\x67\x2f\x3c\x5f\x95\x13\x8f";
+ u8 hash[POLYVAL_BLOCK_SIZE];
+ struct polyval_key key;
+
+ polyval_preparekey(&key, raw_key);
+ polyval(&key, data, sizeof(data), hash);
+ KUNIT_ASSERT_MEMEQ(test, hash, expected_hash, sizeof(hash));
+}
+
+/*
+ * Test a key and messages containing all one bits. This is useful to detect
+ * overflow bugs in implementations that emulate carryless multiplication using
+ * a series of standard multiplications with the bits spread out.
+ */
+static void test_polyval_allones_key_and_message(struct kunit *test)
+{
+ struct polyval_key key;
+ struct polyval_ctx hashofhashes_ctx;
+ u8 hash[POLYVAL_BLOCK_SIZE];
+
+ static_assert(TEST_BUF_LEN >= 4096);
+ memset(test_buf, 0xff, 4096);
+
+ polyval_preparekey(&key, test_buf);
+ polyval_init(&hashofhashes_ctx, &key);
+ for (size_t len = 0; len <= 4096; len += 16) {
+ polyval(&key, test_buf, len, hash);
+ polyval_update(&hashofhashes_ctx, hash, sizeof(hash));
+ }
+ polyval_final(&hashofhashes_ctx, hash);
+ KUNIT_ASSERT_MEMEQ(test, hash, polyval_allones_hashofhashes,
+ sizeof(hash));
+}
+
+#define MAX_LEN_FOR_KEY_CHECK 1024
+
+/*
+ * Given two prepared keys which should be identical (but may differ in
+ * alignment and/or whether they are followed by a guard page or not), verify
+ * that they produce consistent results on various data lengths.
+ */
+static void check_key_consistency(struct kunit *test,
+ const struct polyval_key *key1,
+ const struct polyval_key *key2)
+{
+ u8 *data = test_buf;
+ u8 hash1[POLYVAL_BLOCK_SIZE];
+ u8 hash2[POLYVAL_BLOCK_SIZE];
+
+ rand_bytes(data, MAX_LEN_FOR_KEY_CHECK);
+ KUNIT_ASSERT_MEMEQ(test, key1, key2, sizeof(*key1));
+
+ for (int i = 0; i < 100; i++) {
+ size_t len = rand_length(MAX_LEN_FOR_KEY_CHECK);
+
+ polyval(key1, data, len, hash1);
+ polyval(key2, data, len, hash2);
+ KUNIT_ASSERT_MEMEQ(test, hash1, hash2, sizeof(hash1));
+ }
+}
+
+/* Test that no buffer overreads occur on either raw_key or polyval_key. */
+static void test_polyval_with_guarded_key(struct kunit *test)
+{
+ u8 raw_key[POLYVAL_BLOCK_SIZE];
+ u8 *guarded_raw_key = &test_buf[TEST_BUF_LEN - sizeof(raw_key)];
+ struct polyval_key key1, key2;
+ struct polyval_key *guarded_key =
+ (struct polyval_key *)&test_buf[TEST_BUF_LEN - sizeof(key1)];
+
+ /* Prepare with regular buffers. */
+ rand_bytes(raw_key, sizeof(raw_key));
+ polyval_preparekey(&key1, raw_key);
+
+ /* Prepare with guarded raw_key, then check that it works. */
+ memcpy(guarded_raw_key, raw_key, sizeof(raw_key));
+ polyval_preparekey(&key2, guarded_raw_key);
+ check_key_consistency(test, &key1, &key2);
+
+ /* Prepare guarded polyval_key, then check that it works. */
+ polyval_preparekey(guarded_key, raw_key);
+ check_key_consistency(test, &key1, guarded_key);
+}
+
+/*
+ * Test that polyval_key only needs to be aligned to
+ * __alignof__(struct polyval_key), i.e. 8 bytes. The assembly code may prefer
+ * 16-byte or higher alignment, but it musn't require it.
+ */
+static void test_polyval_with_minimally_aligned_key(struct kunit *test)
+{
+ u8 raw_key[POLYVAL_BLOCK_SIZE];
+ struct polyval_key key;
+ struct polyval_key *minaligned_key =
+ (struct polyval_key *)&test_buf[MAX_LEN_FOR_KEY_CHECK +
+ __alignof__(struct polyval_key)];
+
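+	/*
+	 * The offset 1024 + 8 is 8 mod 16, so if test_buf itself is at least
+	 * 16-byte aligned, minaligned_key is 8-byte but not 16-byte aligned;
+	 * the assertions below verify the actual alignment at runtime.
+	 */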
+ KUNIT_ASSERT_TRUE(test, IS_ALIGNED((uintptr_t)minaligned_key,
+ __alignof__(struct polyval_key)));
+ KUNIT_ASSERT_TRUE(test,
+ !IS_ALIGNED((uintptr_t)minaligned_key,
+ 2 * __alignof__(struct polyval_key)));
+
+ rand_bytes(raw_key, sizeof(raw_key));
+ polyval_preparekey(&key, raw_key);
+ polyval_preparekey(minaligned_key, raw_key);
+ check_key_consistency(test, &key, minaligned_key);
+}
+
+struct polyval_irq_test_state {
+ struct polyval_key expected_key;
+ u8 raw_key[POLYVAL_BLOCK_SIZE];
+};
+
+static bool polyval_irq_test_func(void *state_)
+{
+ struct polyval_irq_test_state *state = state_;
+ struct polyval_key key;
+
+ polyval_preparekey(&key, state->raw_key);
+ return memcmp(&key, &state->expected_key, sizeof(key)) == 0;
+}
+
+/*
+ * Test that polyval_preparekey() produces the same output regardless of whether
+ * FPU or vector registers are usable when it is called.
+ */
+static void test_polyval_preparekey_in_irqs(struct kunit *test)
+{
+ struct polyval_irq_test_state state;
+
+ rand_bytes(state.raw_key, sizeof(state.raw_key));
+ polyval_preparekey(&state.expected_key, state.raw_key);
+ kunit_run_irq_test(test, polyval_irq_test_func, 20000, &state);
+}
+
+static int polyval_suite_init(struct kunit_suite *suite)
+{
+ u8 raw_key[POLYVAL_BLOCK_SIZE];
+
+ rand_bytes_seeded_from_len(raw_key, sizeof(raw_key));
+ polyval_preparekey(&test_key, raw_key);
+ return hash_suite_init(suite);
+}
+
+static void polyval_suite_exit(struct kunit_suite *suite)
+{
+ hash_suite_exit(suite);
+}
+
+static struct kunit_case polyval_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(test_polyval_rfc8452_testvec),
+ KUNIT_CASE(test_polyval_allones_key_and_message),
+ KUNIT_CASE(test_polyval_with_guarded_key),
+ KUNIT_CASE(test_polyval_with_minimally_aligned_key),
+ KUNIT_CASE(test_polyval_preparekey_in_irqs),
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite polyval_test_suite = {
+ .name = "polyval",
+ .test_cases = polyval_test_cases,
+ .suite_init = polyval_suite_init,
+ .suite_exit = polyval_suite_exit,
+};
+kunit_test_suite(polyval_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for POLYVAL");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/sha1-testvecs.h b/lib/crypto/tests/sha1-testvecs.h
new file mode 100644
index 000000000000..f5d050122e84
--- /dev/null
+++ b/lib/crypto/tests/sha1-testvecs.h
@@ -0,0 +1,212 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha1 */
+
+static const struct {
+ size_t data_len;
+ u8 digest[SHA1_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0xda, 0x39, 0xa3, 0xee, 0x5e, 0x6b, 0x4b, 0x0d,
+ 0x32, 0x55, 0xbf, 0xef, 0x95, 0x60, 0x18, 0x90,
+ 0xaf, 0xd8, 0x07, 0x09,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0x0a, 0xd0, 0x52, 0xdd, 0x9f, 0x32, 0x40, 0x55,
+ 0x21, 0xe4, 0x3c, 0x6e, 0xbd, 0xc5, 0x2f, 0x5a,
+ 0x02, 0x54, 0x93, 0xb2,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0x13, 0x83, 0x82, 0x03, 0x23, 0xff, 0x46, 0xd6,
+ 0x12, 0x7f, 0xad, 0x05, 0x2b, 0xc3, 0x4a, 0x42,
+ 0x49, 0x6a, 0xf8, 0x84,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0xe4, 0xdf, 0x7b, 0xdc, 0xe8, 0x6e, 0x81, 0x97,
+ 0x1e, 0x0f, 0xe8, 0x8b, 0x76, 0xa8, 0x59, 0x04,
+ 0xae, 0x92, 0x1a, 0x7c,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0x8c, 0x1c, 0x30, 0xd8, 0xbc, 0xc4, 0xc3, 0xf5,
+ 0xf8, 0x83, 0x0d, 0x1e, 0x04, 0x5d, 0x29, 0xb5,
+ 0x68, 0x89, 0xc1, 0xe9,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0x6c, 0x1d, 0x72, 0x31, 0xa5, 0x03, 0x4f, 0xdc,
+ 0xff, 0x2d, 0x06, 0x3e, 0x24, 0x26, 0x34, 0x8d,
+ 0x60, 0xa4, 0x67, 0x16,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0x37, 0x53, 0x33, 0xfa, 0xd0, 0x21, 0xad, 0xe7,
+ 0xa5, 0x43, 0xf1, 0x94, 0x64, 0x11, 0x47, 0x9c,
+ 0x72, 0xb5, 0x78, 0xb4,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x51, 0x5c, 0xd8, 0x5a, 0xa9, 0xde, 0x7b, 0x2a,
+ 0xa2, 0xff, 0x70, 0x09, 0x56, 0x88, 0x40, 0x2b,
+ 0x50, 0x93, 0x82, 0x47,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0xbc, 0x9c, 0xab, 0x93, 0x06, 0xd5, 0xdb, 0xac,
+ 0x2c, 0x33, 0x15, 0x83, 0x56, 0xf6, 0x91, 0x20,
+ 0x09, 0xc7, 0xb2, 0x6b,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x26, 0x90, 0x3b, 0x47, 0xe1, 0x92, 0x42, 0xd0,
+ 0x85, 0x63, 0x2e, 0x6b, 0x68, 0xa4, 0xc4, 0x4c,
+ 0xe6, 0xf4, 0xb0, 0x52,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0x55, 0x6f, 0x87, 0xdc, 0x34, 0x3d, 0xe2, 0x4f,
+ 0xc3, 0x81, 0xa4, 0x82, 0x79, 0x84, 0x64, 0x01,
+ 0x55, 0xa0, 0x1e, 0x36,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0xb7, 0xd5, 0x5f, 0xa4, 0xef, 0xbf, 0x4f, 0x96,
+ 0x01, 0xc1, 0x06, 0xe3, 0x75, 0xa8, 0x90, 0x92,
+ 0x4c, 0x5f, 0xf1, 0x21,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x70, 0x0c, 0xea, 0xa4, 0x93, 0xd0, 0x56, 0xf0,
+ 0x6f, 0xbb, 0x53, 0x42, 0x5b, 0xe3, 0xf2, 0xb0,
+ 0x30, 0x66, 0x8e, 0x75,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x15, 0x01, 0xbc, 0xb0, 0xee, 0xd8, 0xeb, 0xa8,
+ 0x7d, 0xd9, 0x4d, 0x50, 0x2e, 0x41, 0x30, 0xba,
+ 0x41, 0xaa, 0x7b, 0x02,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x98, 0x05, 0x52, 0xf5, 0x0f, 0xf0, 0xd3, 0x97,
+ 0x15, 0x8c, 0xa3, 0x9a, 0x2b, 0x4d, 0x67, 0x57,
+ 0x29, 0xa0, 0xac, 0x61,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0x1f, 0x47, 0xf0, 0xcc, 0xd7, 0xda, 0xa5, 0x3b,
+ 0x39, 0xb4, 0x5b, 0xa8, 0x33, 0xd4, 0xca, 0x2f,
+ 0xdd, 0xf2, 0x39, 0x89,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0xb9, 0x75, 0xe6, 0x57, 0x42, 0x7f, 0x8b, 0x0a,
+ 0xcc, 0x53, 0x10, 0x69, 0x45, 0xac, 0xfd, 0x11,
+ 0xf7, 0x1f, 0x4e, 0x6f,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0x63, 0x66, 0xcb, 0x44, 0xc1, 0x2c, 0xa2, 0x06,
+ 0x5d, 0xb9, 0x8e, 0x31, 0xcb, 0x4f, 0x4e, 0x49,
+ 0xe0, 0xfb, 0x3c, 0x4e,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0x35, 0xbc, 0x74, 0xfb, 0x31, 0x9c, 0xd4, 0xdd,
+ 0xe8, 0x87, 0xa7, 0x56, 0x3b, 0x08, 0xe5, 0x49,
+ 0xe1, 0xe9, 0xc9, 0xa8,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0x43, 0x00, 0xea, 0xcd, 0x4e, 0x7c, 0xe9, 0xe4,
+ 0x32, 0xce, 0x25, 0xa8, 0xcd, 0x20, 0xa8, 0xaa,
+ 0x7b, 0x63, 0x2c, 0x3c,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0xd0, 0x67, 0x26, 0x0e, 0x22, 0x72, 0xaa, 0x63,
+ 0xfc, 0x34, 0x55, 0x07, 0xab, 0xc8, 0x64, 0xb6,
+ 0xc4, 0xea, 0xd5, 0x7c,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0x6b, 0xc9, 0x5e, 0xb9, 0x41, 0x19, 0x50, 0x35,
+ 0xf1, 0x39, 0xfe, 0xd9, 0x72, 0x6d, 0xd0, 0x55,
+ 0xb8, 0x1f, 0x1a, 0x95,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0x70, 0x5d, 0x10, 0x2e, 0x4e, 0x44, 0xc9, 0x80,
+ 0x8f, 0xba, 0x13, 0xbc, 0xd0, 0x77, 0x78, 0xc7,
+ 0x84, 0xe3, 0x24, 0x43,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0xa8, 0x82, 0xca, 0x08, 0xb4, 0x84, 0x09, 0x13,
+ 0xc0, 0x9c, 0x26, 0x18, 0xcf, 0x0f, 0xf3, 0x08,
+ 0xff, 0xa1, 0xe4, 0x5d,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[SHA1_DIGEST_SIZE] = {
+ 0xe1, 0x72, 0xa5, 0x3c, 0xda, 0xf2, 0xe5, 0x56,
+ 0xb8, 0xb5, 0x35, 0x6e, 0xce, 0xc8, 0x37, 0x57,
+ 0x31, 0xb4, 0x05, 0xdd,
+};
+
+static const u8 hmac_testvec_consolidated[SHA1_DIGEST_SIZE] = {
+ 0x9d, 0xe5, 0xb1, 0x43, 0x97, 0x95, 0x16, 0x52,
+ 0xa0, 0x7a, 0xc0, 0xe2, 0xc1, 0x60, 0x64, 0x7c,
+ 0x24, 0xf9, 0x34, 0xd7,
+};
diff --git a/lib/crypto/tests/sha1_kunit.c b/lib/crypto/tests/sha1_kunit.c
new file mode 100644
index 000000000000..24ba8d5669c8
--- /dev/null
+++ b/lib/crypto/tests/sha1_kunit.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/sha1.h>
+#include "sha1-testvecs.h"
+
+#define HASH sha1
+#define HASH_CTX sha1_ctx
+#define HASH_SIZE SHA1_DIGEST_SIZE
+#define HASH_INIT sha1_init
+#define HASH_UPDATE sha1_update
+#define HASH_FINAL sha1_final
+#define HMAC_KEY hmac_sha1_key
+#define HMAC_CTX hmac_sha1_ctx
+#define HMAC_PREPAREKEY hmac_sha1_preparekey
+#define HMAC_INIT hmac_sha1_init
+#define HMAC_UPDATE hmac_sha1_update
+#define HMAC_FINAL hmac_sha1_final
+#define HMAC hmac_sha1
+#define HMAC_USINGRAWKEY hmac_sha1_usingrawkey
+#include "hash-test-template.h"
+
+static struct kunit_case hash_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite hash_test_suite = {
+ .name = "sha1",
+ .test_cases = hash_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(hash_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-1 and HMAC-SHA1");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/sha224-testvecs.h b/lib/crypto/tests/sha224-testvecs.h
new file mode 100644
index 000000000000..523adc178e1a
--- /dev/null
+++ b/lib/crypto/tests/sha224-testvecs.h
@@ -0,0 +1,238 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha224 */
+
+static const struct {
+ size_t data_len;
+ u8 digest[SHA224_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0xd1, 0x4a, 0x02, 0x8c, 0x2a, 0x3a, 0x2b, 0xc9,
+ 0x47, 0x61, 0x02, 0xbb, 0x28, 0x82, 0x34, 0xc4,
+ 0x15, 0xa2, 0xb0, 0x1f, 0x82, 0x8e, 0xa6, 0x2a,
+ 0xc5, 0xb3, 0xe4, 0x2f,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0xe3, 0x4d, 0x79, 0x17, 0x75, 0x35, 0xdc, 0xd2,
+ 0x27, 0xc9, 0x9d, 0x0b, 0x90, 0x0f, 0x21, 0x5d,
+ 0x95, 0xfb, 0x9c, 0x6d, 0xa8, 0xec, 0x19, 0x15,
+ 0x12, 0xef, 0xf5, 0x0f,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0x81, 0xc7, 0x60, 0x0d, 0x6d, 0x13, 0x75, 0x70,
+ 0x4b, 0xc0, 0xab, 0xea, 0x04, 0xe3, 0x78, 0x7e,
+ 0x73, 0xb9, 0x0f, 0xb6, 0xae, 0x90, 0xf3, 0x94,
+ 0xb2, 0x56, 0xda, 0xc8,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0x24, 0xf0, 0x8c, 0x6e, 0x9d, 0xd6, 0x06, 0x80,
+ 0x0a, 0x03, 0xee, 0x9b, 0x33, 0xec, 0x83, 0x42,
+ 0x2c, 0x8b, 0xe7, 0xc7, 0xc6, 0x04, 0xfb, 0xc6,
+ 0xa3, 0x3a, 0x4d, 0xc9,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0x1c, 0x08, 0xa8, 0x55, 0x8f, 0xc6, 0x0a, 0xea,
+ 0x2f, 0x1b, 0x54, 0xff, 0x8d, 0xd2, 0xa3, 0xc7,
+ 0x42, 0xc2, 0x93, 0x3d, 0x73, 0x18, 0x84, 0xba,
+ 0x75, 0x49, 0x34, 0xfd,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0x45, 0xdd, 0xb5, 0xf0, 0x3c, 0xda, 0xe6, 0xd4,
+ 0x6c, 0x86, 0x91, 0x29, 0x11, 0x2f, 0x88, 0x7d,
+ 0xd8, 0x3c, 0xa3, 0xd6, 0xdd, 0x1e, 0xac, 0x98,
+ 0xff, 0xf0, 0x14, 0x69,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0x0b, 0xfb, 0x71, 0x4c, 0x06, 0x7a, 0xd5, 0x89,
+ 0x76, 0x0a, 0x43, 0x8b, 0x2b, 0x47, 0x12, 0x56,
+ 0xa7, 0x64, 0x33, 0x1d, 0xd3, 0x44, 0x17, 0x95,
+ 0x23, 0xe7, 0x53, 0x01,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0xc4, 0xae, 0x9c, 0x33, 0xd5, 0x1d, 0xf4, 0xa7,
+ 0xfd, 0xb7, 0xd4, 0x6b, 0xc3, 0xeb, 0xa8, 0xbf,
+ 0xfb, 0x07, 0x89, 0x4b, 0x07, 0x15, 0x22, 0xec,
+ 0xe1, 0x45, 0x84, 0xba,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0xad, 0x01, 0x34, 0x2a, 0xe2, 0x3b, 0x58, 0x06,
+ 0x9f, 0x20, 0xc8, 0xfb, 0xf3, 0x20, 0x82, 0xa6,
+ 0x9f, 0xee, 0x7a, 0xbe, 0xdf, 0xf3, 0x5d, 0x57,
+ 0x9b, 0xce, 0x79, 0x96,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0xa7, 0xa6, 0x47, 0xf7, 0xed, 0x2a, 0xe5, 0xe3,
+ 0xc0, 0x1e, 0x7b, 0x40, 0xe4, 0xf7, 0x40, 0x65,
+ 0x42, 0xc1, 0x6f, 0x7d, 0x8d, 0x0d, 0x17, 0x4f,
+ 0xd3, 0xbc, 0x0d, 0x85,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0xc4, 0x9c, 0xb5, 0x6a, 0x01, 0x2d, 0x10, 0xa9,
+ 0x5f, 0xa4, 0x5a, 0xe1, 0xba, 0x40, 0x12, 0x09,
+ 0x7b, 0xea, 0xdb, 0xa6, 0x7b, 0xcb, 0x56, 0xf0,
+ 0xfd, 0x5b, 0xe2, 0xe7,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0x14, 0xda, 0x0e, 0x01, 0xca, 0x78, 0x7d, 0x2d,
+ 0x85, 0xa3, 0xca, 0x0e, 0x80, 0xf9, 0x95, 0x10,
+ 0xa1, 0x7b, 0xa5, 0xaa, 0xfc, 0x95, 0x05, 0x08,
+ 0x53, 0xda, 0x52, 0xee,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0xa5, 0x24, 0xc4, 0x54, 0xe1, 0x50, 0xab, 0xee,
+ 0x22, 0xc1, 0xa7, 0x27, 0x15, 0x2c, 0x6f, 0xf7,
+ 0x4c, 0x31, 0xe5, 0x15, 0x25, 0x4e, 0x71, 0xc6,
+ 0x7e, 0xa0, 0x11, 0x5d,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x73, 0xd0, 0x8c, 0xce, 0xed, 0xed, 0x9f, 0xaa,
+ 0x21, 0xaf, 0xa2, 0x08, 0x80, 0x16, 0x15, 0x59,
+ 0x3f, 0x1d, 0x7f, 0x0a, 0x79, 0x3d, 0x7b, 0x58,
+ 0xf8, 0xc8, 0x5c, 0x27,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x31, 0xa7, 0xa1, 0xca, 0x49, 0x72, 0x75, 0xcc,
+ 0x6e, 0x02, 0x9e, 0xad, 0xea, 0x86, 0x5c, 0x91,
+ 0x02, 0xe4, 0xc9, 0xf9, 0xd3, 0x9e, 0x74, 0x50,
+ 0xd8, 0x43, 0x6b, 0x85,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0x40, 0x60, 0x8b, 0xb0, 0x03, 0xa9, 0x75, 0xab,
+ 0x2d, 0x5b, 0x20, 0x9a, 0x05, 0x72, 0xb7, 0xa8,
+ 0xce, 0xf2, 0x4f, 0x66, 0x62, 0xe3, 0x7e, 0x24,
+ 0xd6, 0xe2, 0xea, 0xfa,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0x4f, 0x5f, 0x9f, 0x1e, 0xb3, 0x66, 0x81, 0xdb,
+ 0x41, 0x5d, 0x65, 0x97, 0x00, 0x8d, 0xdc, 0x62,
+ 0x03, 0xb0, 0x4d, 0x6b, 0x5c, 0x7f, 0x1e, 0xa0,
+ 0xfe, 0xfc, 0x0e, 0xb8,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0x08, 0xa8, 0xa1, 0xc0, 0xd8, 0xf9, 0xb4, 0xaa,
+ 0x53, 0x22, 0xa1, 0x73, 0x0b, 0x45, 0xa0, 0x20,
+ 0x72, 0xf3, 0xa9, 0xbc, 0x51, 0xd0, 0x20, 0x79,
+ 0x69, 0x97, 0xf7, 0xe3,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0xe8, 0x60, 0x5f, 0xb9, 0x12, 0xe1, 0x6b, 0x24,
+ 0xc5, 0xe8, 0x43, 0xa9, 0x5c, 0x3f, 0x65, 0xed,
+ 0xbe, 0xfd, 0x77, 0xf5, 0x47, 0xf2, 0x75, 0x21,
+ 0xc2, 0x8f, 0x54, 0x8f,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0xc7, 0xdf, 0x50, 0x16, 0x10, 0x01, 0xb7, 0xdf,
+ 0x34, 0x1d, 0x18, 0xa2, 0xd5, 0xad, 0x1f, 0x50,
+ 0xf7, 0xa8, 0x9a, 0x72, 0xfb, 0xfd, 0xd9, 0x1c,
+ 0x57, 0xac, 0x08, 0x97,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0xdf, 0x16, 0x76, 0x7f, 0xc0, 0x16, 0x84, 0x63,
+ 0xac, 0xcf, 0xd0, 0x78, 0x1e, 0x96, 0x67, 0xc5,
+ 0x3c, 0x06, 0xe9, 0xdb, 0x6e, 0x7d, 0xd0, 0x07,
+ 0xaa, 0xb1, 0x56, 0xc9,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0x49, 0xec, 0x5c, 0x18, 0xd7, 0x5b, 0xda, 0xed,
+ 0x5b, 0x59, 0xde, 0x09, 0x34, 0xb2, 0x49, 0x43,
+ 0x62, 0x6a, 0x0a, 0x63, 0x6a, 0x51, 0x08, 0x37,
+ 0x8c, 0xb6, 0x29, 0x84,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0x3d, 0xc2, 0xc8, 0x43, 0xcf, 0xb7, 0x33, 0x14,
+ 0x04, 0x93, 0xed, 0xe2, 0xcd, 0x8a, 0x69, 0x5c,
+ 0x5a, 0xd5, 0x9b, 0x52, 0xdf, 0x48, 0xa7, 0xaa,
+ 0x28, 0x2b, 0x5d, 0x27,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0xa7, 0xaf, 0xda, 0x92, 0xe2, 0xe7, 0x61, 0xdc,
+ 0xa1, 0x32, 0x53, 0x2a, 0x3f, 0x41, 0x5c, 0x7e,
+ 0xc9, 0x89, 0xda, 0x1c, 0xf7, 0x8d, 0x00, 0xbd,
+ 0x21, 0x73, 0xb1, 0x69,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[SHA224_DIGEST_SIZE] = {
+ 0x9e, 0xb8, 0x82, 0xab, 0x83, 0x37, 0xe4, 0x63,
+ 0x84, 0xee, 0x21, 0x15, 0xc2, 0xbb, 0xa3, 0x17,
+ 0x8f, 0xc4, 0x99, 0x33, 0xa0, 0x2c, 0x9f, 0xec,
+ 0xca, 0xd0, 0xf3, 0x73,
+};
+
+static const u8 hmac_testvec_consolidated[SHA224_DIGEST_SIZE] = {
+ 0x66, 0x34, 0x79, 0x92, 0x47, 0x0e, 0xcd, 0x70,
+ 0xb0, 0x8b, 0x91, 0xcb, 0x94, 0x2f, 0x67, 0x65,
+ 0x2f, 0xc9, 0xd2, 0x91, 0x32, 0xaf, 0xf7, 0x5f,
+ 0xb6, 0x01, 0x5b, 0xf2,
+};
diff --git a/lib/crypto/tests/sha224_kunit.c b/lib/crypto/tests/sha224_kunit.c
new file mode 100644
index 000000000000..962ad46b9c99
--- /dev/null
+++ b/lib/crypto/tests/sha224_kunit.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/sha2.h>
+#include "sha224-testvecs.h"
+
+#define HASH sha224
+#define HASH_CTX sha224_ctx
+#define HASH_SIZE SHA224_DIGEST_SIZE
+#define HASH_INIT sha224_init
+#define HASH_UPDATE sha224_update
+#define HASH_FINAL sha224_final
+#define HMAC_KEY hmac_sha224_key
+#define HMAC_CTX hmac_sha224_ctx
+#define HMAC_PREPAREKEY hmac_sha224_preparekey
+#define HMAC_INIT hmac_sha224_init
+#define HMAC_UPDATE hmac_sha224_update
+#define HMAC_FINAL hmac_sha224_final
+#define HMAC hmac_sha224
+#define HMAC_USINGRAWKEY hmac_sha224_usingrawkey
+#include "hash-test-template.h"
+
+static struct kunit_case hash_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite hash_test_suite = {
+ .name = "sha224",
+ .test_cases = hash_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(hash_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-224 and HMAC-SHA224");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/sha256-testvecs.h b/lib/crypto/tests/sha256-testvecs.h
new file mode 100644
index 000000000000..2db7b2042034
--- /dev/null
+++ b/lib/crypto/tests/sha256-testvecs.h
@@ -0,0 +1,238 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha256 */
+
+static const struct {
+ size_t data_len;
+ u8 digest[SHA256_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14,
+ 0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f, 0xb9, 0x24,
+ 0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c,
+ 0xa4, 0x95, 0x99, 0x1b, 0x78, 0x52, 0xb8, 0x55,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0x45, 0xf8, 0x3d, 0x17, 0xe1, 0x0b, 0x34, 0xfc,
+ 0xa0, 0x1e, 0xb8, 0xf4, 0x45, 0x4d, 0xac, 0x34,
+ 0xa7, 0x77, 0xd9, 0x40, 0x4a, 0x46, 0x4e, 0x73,
+ 0x2c, 0xf4, 0xab, 0xf2, 0xc0, 0xda, 0x94, 0xc4,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0xf9, 0xd3, 0x52, 0x2f, 0xd5, 0xe0, 0x99, 0x15,
+ 0x1c, 0xd6, 0xa9, 0x24, 0x4f, 0x40, 0xba, 0x25,
+ 0x33, 0x43, 0x3e, 0xe1, 0x78, 0x6a, 0xfe, 0x7d,
+ 0x07, 0xe2, 0x29, 0x7b, 0x6d, 0xc5, 0x73, 0xf5,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0x71, 0xf7, 0xa1, 0xef, 0x69, 0x86, 0x0e, 0xe4,
+ 0x87, 0x25, 0x58, 0x4c, 0x07, 0x2c, 0xfc, 0x60,
+ 0xc5, 0xf6, 0xe2, 0x44, 0xaa, 0xfb, 0x41, 0xc7,
+ 0x2b, 0xc5, 0x01, 0x8c, 0x39, 0x98, 0x30, 0x37,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0x09, 0x95, 0x9a, 0xfa, 0x25, 0x18, 0x86, 0x06,
+ 0xfe, 0x65, 0xc9, 0x2f, 0x91, 0x15, 0x74, 0x06,
+ 0x6c, 0xbf, 0xef, 0x7b, 0x0b, 0xc7, 0x2c, 0x05,
+ 0xdd, 0x17, 0x5d, 0x6f, 0x8a, 0xa5, 0xde, 0x3c,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0xe5, 0x52, 0x3c, 0x85, 0xea, 0x1b, 0xe1, 0x6c,
+ 0xe0, 0xdb, 0xc3, 0xef, 0xf0, 0xca, 0xc2, 0xe1,
+ 0xb9, 0x36, 0xa1, 0x28, 0xb6, 0x9e, 0xf5, 0x6e,
+ 0x70, 0xf7, 0xf9, 0xa7, 0x1c, 0xd3, 0x22, 0xd0,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0x5f, 0x84, 0xd4, 0xd7, 0x2e, 0x80, 0x09, 0xef,
+ 0x1c, 0x77, 0x7c, 0x25, 0x59, 0x63, 0x88, 0x64,
+ 0xfd, 0x56, 0xea, 0x23, 0xf4, 0x4f, 0x2e, 0x49,
+ 0xcd, 0xb4, 0xaa, 0xc7, 0x5c, 0x8b, 0x75, 0x84,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x22, 0x6e, 0xca, 0xda, 0x00, 0x2d, 0x90, 0x96,
+ 0x24, 0xf8, 0x55, 0x17, 0x11, 0xda, 0x42, 0x1c,
+ 0x78, 0x4e, 0xbf, 0xd9, 0xc5, 0xcf, 0xf3, 0xe3,
+ 0xaf, 0xd3, 0x60, 0xcd, 0xaa, 0xe2, 0xc7, 0x22,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0x97, 0xe2, 0x74, 0xdc, 0x6b, 0xa4, 0xaf, 0x32,
+ 0x3b, 0x50, 0x6d, 0x80, 0xb5, 0xd3, 0x0c, 0x36,
+ 0xea, 0x3f, 0x5d, 0x36, 0xa7, 0x49, 0x51, 0xf3,
+ 0xbd, 0x69, 0x68, 0x60, 0x9b, 0xde, 0x73, 0xf5,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x13, 0x74, 0xb1, 0x72, 0xd6, 0x53, 0x48, 0x28,
+ 0x42, 0xd8, 0xba, 0x64, 0x20, 0x60, 0xb6, 0x4c,
+ 0xc3, 0xac, 0x5d, 0x93, 0x8c, 0xb9, 0xd4, 0xcc,
+ 0xb4, 0x9f, 0x31, 0x1f, 0xeb, 0x68, 0x35, 0x58,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0xda, 0xbe, 0xd7, 0xbc, 0x6e, 0xe6, 0x5a, 0x57,
+ 0xeb, 0x9a, 0x93, 0xaa, 0x66, 0xd0, 0xe0, 0xc4,
+ 0x29, 0x7f, 0xe9, 0x3b, 0x8e, 0xdf, 0x81, 0x82,
+ 0x8d, 0x15, 0x11, 0x59, 0x4e, 0x13, 0xa5, 0x58,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0x8c, 0x1a, 0xba, 0x40, 0x66, 0x94, 0x19, 0xf4,
+ 0x2e, 0xa2, 0xae, 0x94, 0x53, 0x18, 0xb6, 0xfd,
+ 0xa0, 0x12, 0xc5, 0xef, 0xd5, 0xd6, 0x1b, 0xa1,
+ 0x37, 0xea, 0x19, 0x44, 0x35, 0x54, 0x85, 0x74,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0xfd, 0x07, 0xd8, 0x77, 0x7d, 0x8b, 0x4f, 0xee,
+ 0x60, 0x60, 0x26, 0xef, 0x2a, 0x86, 0xfb, 0x67,
+ 0xeb, 0x31, 0x27, 0x03, 0x99, 0x3c, 0xde, 0xe5,
+ 0x84, 0x72, 0x71, 0x4c, 0x33, 0x7b, 0x87, 0x13,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x97, 0xc5, 0x58, 0x38, 0x20, 0xc7, 0xde, 0xfa,
+ 0xdd, 0x9b, 0x10, 0xc6, 0xc2, 0x2f, 0x94, 0xb5,
+ 0xc0, 0x33, 0xc0, 0x20, 0x1c, 0x2f, 0xb4, 0x28,
+ 0x5e, 0x36, 0xfa, 0x8c, 0x24, 0x1c, 0x18, 0x27,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x62, 0x17, 0x84, 0x26, 0x98, 0x30, 0x57, 0xca,
+ 0x4f, 0x32, 0xd9, 0x09, 0x09, 0x34, 0xe2, 0xcb,
+ 0x92, 0x45, 0xd5, 0xeb, 0x8b, 0x9b, 0x3c, 0xd8,
+ 0xaa, 0xc7, 0xd2, 0x2b, 0x04, 0xab, 0xb3, 0x35,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0x7f, 0xe1, 0x09, 0x78, 0x5d, 0x61, 0xfa, 0x5e,
+ 0x9b, 0x8c, 0xb1, 0xa9, 0x09, 0x69, 0xb4, 0x24,
+ 0x54, 0xf2, 0x1c, 0xc9, 0x5f, 0xfb, 0x59, 0x9d,
+ 0x36, 0x1b, 0x37, 0x44, 0xfc, 0x64, 0x79, 0xb6,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0xd2, 0x3b, 0x3a, 0xe7, 0x13, 0x4f, 0xbd, 0x29,
+ 0x6b, 0xd2, 0x79, 0x26, 0x6c, 0xd2, 0x22, 0x43,
+ 0x25, 0x34, 0x9b, 0x9b, 0x22, 0xb0, 0x9f, 0x61,
+ 0x1d, 0xf4, 0xe2, 0x65, 0x68, 0x95, 0x02, 0x6c,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0x0c, 0x34, 0x53, 0x3f, 0x0f, 0x8a, 0x39, 0x8d,
+ 0x63, 0xe4, 0x83, 0x6e, 0x11, 0x7d, 0x14, 0x8e,
+ 0x5b, 0xf0, 0x4d, 0xca, 0x23, 0x24, 0xb5, 0xd2,
+ 0x13, 0x3f, 0xd9, 0xde, 0x84, 0x74, 0x26, 0x59,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0xa8, 0xb8, 0x83, 0x01, 0x1b, 0x38, 0x7a, 0xca,
+ 0x59, 0xe9, 0x5b, 0x37, 0x6a, 0xab, 0xb4, 0x85,
+ 0x94, 0x73, 0x26, 0x04, 0xef, 0xed, 0xf4, 0x0d,
+ 0xd6, 0x09, 0x21, 0x09, 0x96, 0x78, 0xe3, 0xcf,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0x0b, 0x12, 0x66, 0x96, 0x78, 0x4f, 0x2c, 0x35,
+ 0xa4, 0xed, 0xbc, 0xb8, 0x30, 0xa6, 0x37, 0x9b,
+ 0x94, 0x13, 0xae, 0x86, 0xf0, 0x20, 0xfb, 0x49,
+ 0x8f, 0x5d, 0x20, 0x70, 0x60, 0x2b, 0x02, 0x70,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0xe4, 0xbd, 0xe4, 0x3b, 0x85, 0xf4, 0x6f, 0x11,
+ 0xad, 0xc4, 0x79, 0xcc, 0x8e, 0x6d, 0x8b, 0x15,
+ 0xbb, 0xf9, 0xd3, 0x65, 0xe1, 0xf8, 0x8d, 0x22,
+ 0x65, 0x66, 0x66, 0xb3, 0xf5, 0xd0, 0x9c, 0xaf,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0x90, 0x5f, 0xe0, 0xfc, 0xb1, 0xdc, 0x38, 0x1b,
+ 0xe5, 0x37, 0x3f, 0xd2, 0xcc, 0x48, 0xc4, 0xbc,
+ 0xb4, 0xfd, 0xf7, 0x71, 0x5f, 0x6b, 0xf4, 0xc4,
+ 0xa6, 0x08, 0x7e, 0xfc, 0x4e, 0x96, 0xf7, 0xc2,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0x1f, 0x34, 0x0a, 0x3b, 0xdb, 0xf7, 0x7a, 0xdb,
+ 0x3d, 0x89, 0x85, 0x0c, 0xd2, 0xf0, 0x0c, 0xbd,
+ 0x25, 0x39, 0x14, 0x06, 0x28, 0x0f, 0x6b, 0x5f,
+ 0xe3, 0x1f, 0x2a, 0xb6, 0xca, 0x56, 0x41, 0xa1,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0x7b, 0x01, 0x2d, 0x84, 0x70, 0xee, 0xe0, 0x77,
+ 0x3c, 0x17, 0x63, 0xfe, 0x40, 0xd7, 0xfd, 0xa1,
+ 0x75, 0x90, 0xb8, 0x3e, 0x50, 0xcd, 0x06, 0xb7,
+ 0xb9, 0xb9, 0x2b, 0x91, 0x4f, 0xba, 0xe4, 0x4c,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[SHA256_DIGEST_SIZE] = {
+ 0x78, 0x1c, 0xb1, 0x9f, 0xe5, 0xe1, 0xcb, 0x41,
+ 0x8b, 0x34, 0x00, 0x33, 0x57, 0xc3, 0x1c, 0x8f,
+ 0x5c, 0x84, 0xc7, 0x8b, 0x87, 0x6a, 0x13, 0x29,
+ 0x2f, 0xc4, 0x1a, 0x70, 0x5e, 0x40, 0xf2, 0xfe,
+};
+
+static const u8 hmac_testvec_consolidated[SHA256_DIGEST_SIZE] = {
+ 0x56, 0x96, 0x2e, 0x23, 0x3f, 0x94, 0x89, 0x0d,
+ 0x0f, 0x24, 0x36, 0x2e, 0x19, 0x3d, 0xb5, 0xac,
+ 0xb8, 0xcd, 0xf1, 0xc9, 0xca, 0xac, 0xee, 0x9d,
+ 0x62, 0xe6, 0x81, 0xe5, 0x96, 0xf9, 0x38, 0xf5,
+};
diff --git a/lib/crypto/tests/sha256_kunit.c b/lib/crypto/tests/sha256_kunit.c
new file mode 100644
index 000000000000..5dccdee79693
--- /dev/null
+++ b/lib/crypto/tests/sha256_kunit.c
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/sha2.h>
+#include "sha256-testvecs.h"
+
+/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */
+#define HASH sha256
+#define HASH_CTX sha256_ctx
+#define HASH_SIZE SHA256_DIGEST_SIZE
+#define HASH_INIT sha256_init
+#define HASH_UPDATE sha256_update
+#define HASH_FINAL sha256_final
+#define HMAC_KEY hmac_sha256_key
+#define HMAC_CTX hmac_sha256_ctx
+#define HMAC_PREPAREKEY hmac_sha256_preparekey
+#define HMAC_INIT hmac_sha256_init
+#define HMAC_UPDATE hmac_sha256_update
+#define HMAC_FINAL hmac_sha256_final
+#define HMAC hmac_sha256
+#define HMAC_USINGRAWKEY hmac_sha256_usingrawkey
+#include "hash-test-template.h"
+
+static void free_guarded_buf(void *buf)
+{
+ vfree(buf);
+}
+
+/*
+ * Allocate a KUnit-managed buffer that has length @len bytes immediately
+ * followed by an unmapped page, and assert that the allocation succeeds.
+ */
+static void *alloc_guarded_buf(struct kunit *test, size_t len)
+{
+ size_t full_len = round_up(len, PAGE_SIZE);
+ void *buf = vmalloc(full_len);
+
+ KUNIT_ASSERT_NOT_NULL(test, buf);
+ KUNIT_ASSERT_EQ(test, 0,
+ kunit_add_action_or_reset(test, free_guarded_buf, buf));
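+	/*
+	 * Place the buffer at the very end of the vmalloc area; vmalloc
+	 * leaves a guard page after the area, so any access past the
+	 * returned @len bytes faults.
+	 */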
+ return buf + full_len - len;
+}
+
+/*
+ * Test for sha256_finup_2x(). Specifically, choose various data lengths and
+ * salt lengths, and for each one, verify that sha256_finup_2x() produces the
+ * same results as sha256_update() and sha256_final().
+ *
+ * Use guarded buffers for all inputs and outputs to reliably detect any
+ * out-of-bounds reads or writes, even if they occur in assembly code.
+ */
+static void test_sha256_finup_2x(struct kunit *test)
+{
+ const size_t max_data_len = 16384;
+ u8 *data1_buf, *data2_buf, *hash1, *hash2;
+ u8 expected_hash1[SHA256_DIGEST_SIZE];
+ u8 expected_hash2[SHA256_DIGEST_SIZE];
+ u8 salt[SHA256_BLOCK_SIZE];
+ struct sha256_ctx *ctx;
+
+ data1_buf = alloc_guarded_buf(test, max_data_len);
+ data2_buf = alloc_guarded_buf(test, max_data_len);
+ hash1 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE);
+ hash2 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE);
+ ctx = alloc_guarded_buf(test, sizeof(*ctx));
+
+ rand_bytes(data1_buf, max_data_len);
+ rand_bytes(data2_buf, max_data_len);
+ rand_bytes(salt, sizeof(salt));
+ memset(ctx, 0, sizeof(*ctx));
+
+ for (size_t i = 0; i < 500; i++) {
+ size_t salt_len = rand_length(sizeof(salt));
+ size_t data_len = rand_length(max_data_len);
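+		/*
+		 * Slide the data to the end of its guarded buffer so that any
+		 * read past data_len hits the unmapped page.
+		 */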
+ const u8 *data1 = data1_buf + max_data_len - data_len;
+ const u8 *data2 = data2_buf + max_data_len - data_len;
+ struct sha256_ctx orig_ctx;
+
+ sha256_init(ctx);
+ sha256_update(ctx, salt, salt_len);
+ orig_ctx = *ctx;
+
+ sha256_finup_2x(ctx, data1, data2, data_len, hash1, hash2);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, ctx, &orig_ctx, sizeof(*ctx),
+ "sha256_finup_2x() modified its ctx argument");
+
+ sha256_update(ctx, data1, data_len);
+ sha256_final(ctx, expected_hash1);
+ sha256_update(&orig_ctx, data2, data_len);
+ sha256_final(&orig_ctx, expected_hash2);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, hash1, expected_hash1, SHA256_DIGEST_SIZE,
+ "Wrong hash1 with salt_len=%zu data_len=%zu", salt_len,
+ data_len);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, hash2, expected_hash2, SHA256_DIGEST_SIZE,
+ "Wrong hash2 with salt_len=%zu data_len=%zu", salt_len,
+ data_len);
+ }
+}
+
+/* Test sha256_finup_2x() with ctx == NULL */
+static void test_sha256_finup_2x_defaultctx(struct kunit *test)
+{
+ const size_t data_len = 128;
+ struct sha256_ctx ctx;
+ u8 hash1_a[SHA256_DIGEST_SIZE];
+ u8 hash2_a[SHA256_DIGEST_SIZE];
+ u8 hash1_b[SHA256_DIGEST_SIZE];
+ u8 hash2_b[SHA256_DIGEST_SIZE];
+
+ rand_bytes(test_buf, 2 * data_len);
+
+ sha256_init(&ctx);
+ sha256_finup_2x(&ctx, test_buf, &test_buf[data_len], data_len, hash1_a,
+ hash2_a);
+
+ sha256_finup_2x(NULL, test_buf, &test_buf[data_len], data_len, hash1_b,
+ hash2_b);
+
+ KUNIT_ASSERT_MEMEQ(test, hash1_a, hash1_b, SHA256_DIGEST_SIZE);
+ KUNIT_ASSERT_MEMEQ(test, hash2_a, hash2_b, SHA256_DIGEST_SIZE);
+}
+
+/*
+ * Test that sha256_finup_2x() and sha256_update/final() produce consistent
+ * results with total message lengths that require more than 32 bits.
+ */
+static void test_sha256_finup_2x_hugelen(struct kunit *test)
+{
+ const size_t data_len = 4 * SHA256_BLOCK_SIZE;
+ struct sha256_ctx ctx = {};
+ u8 expected_hash[SHA256_DIGEST_SIZE];
+ u8 hash[SHA256_DIGEST_SIZE];
+
+ rand_bytes(test_buf, data_len);
+ for (size_t align = 0; align < SHA256_BLOCK_SIZE; align++) {
+ sha256_init(&ctx);
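+		/*
+		 * Simulate a previously-processed byte count that needs more
+		 * than 32 bits, at each possible offset within a block.
+		 */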
+ ctx.ctx.bytecount = 0x123456789abcd00 + align;
+
+ sha256_finup_2x(&ctx, test_buf, test_buf, data_len, hash, hash);
+
+ sha256_update(&ctx, test_buf, data_len);
+ sha256_final(&ctx, expected_hash);
+
+ KUNIT_ASSERT_MEMEQ(test, hash, expected_hash,
+ SHA256_DIGEST_SIZE);
+ }
+}
+
+/* Benchmark for sha256_finup_2x() */
+static void benchmark_sha256_finup_2x(struct kunit *test)
+{
+ /*
+ * Try a few different salt lengths, since sha256_finup_2x() performance
+ * may vary slightly for the same data_len depending on how many bytes
+ * were already processed in the initial context.
+ */
+ static const size_t salt_lens_to_test[] = { 0, 32, 64 };
+ const size_t data_len = 4096;
+ const size_t num_iters = 4096;
+ struct sha256_ctx ctx;
+ u8 hash1[SHA256_DIGEST_SIZE];
+ u8 hash2[SHA256_DIGEST_SIZE];
+
+ if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK))
+ kunit_skip(test, "not enabled");
+ if (!sha256_finup_2x_is_optimized())
+ kunit_skip(test, "not relevant");
+
+ rand_bytes(test_buf, data_len * 2);
+
+ /* Warm-up */
+ for (size_t i = 0; i < num_iters; i++)
+ sha256_finup_2x(NULL, &test_buf[0], &test_buf[data_len],
+ data_len, hash1, hash2);
+
+ for (size_t i = 0; i < ARRAY_SIZE(salt_lens_to_test); i++) {
+ size_t salt_len = salt_lens_to_test[i];
+ u64 t0, t1;
+
+ /*
+ * Prepare the initial context. The time to process the salt is
+ * not measured; we're just interested in sha256_finup_2x().
+ */
+ sha256_init(&ctx);
+ sha256_update(&ctx, test_buf, salt_len);
+
+ preempt_disable();
+ t0 = ktime_get_ns();
+ for (size_t j = 0; j < num_iters; j++)
+ sha256_finup_2x(&ctx, &test_buf[0], &test_buf[data_len],
+ data_len, hash1, hash2);
+ t1 = ktime_get_ns();
+ preempt_enable();
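+		/*
+		 * Each call hashes 2 * data_len bytes, so the throughput in
+		 * MB/s is (2 * data_len * num_iters * 1000) / elapsed_ns.
+		 */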
+ kunit_info(test, "data_len=%zu salt_len=%zu: %llu MB/s",
+ data_len, salt_len,
+ div64_u64((u64)data_len * 2 * num_iters * 1000,
+ t1 - t0 ?: 1));
+ }
+}
+
+static struct kunit_case hash_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(test_sha256_finup_2x),
+ KUNIT_CASE(test_sha256_finup_2x_defaultctx),
+ KUNIT_CASE(test_sha256_finup_2x_hugelen),
+ KUNIT_CASE(benchmark_hash),
+ KUNIT_CASE(benchmark_sha256_finup_2x),
+ {},
+};
+
+static struct kunit_suite hash_test_suite = {
+ .name = "sha256",
+ .test_cases = hash_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(hash_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-256 and HMAC-SHA256");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/sha3-testvecs.h b/lib/crypto/tests/sha3-testvecs.h
new file mode 100644
index 000000000000..8d614a5fa0c3
--- /dev/null
+++ b/lib/crypto/tests/sha3-testvecs.h
@@ -0,0 +1,249 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha3 */
+
+/* SHA3-256 test vectors */
+
+static const struct {
+ size_t data_len;
+ u8 digest[SHA3_256_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0xa7, 0xff, 0xc6, 0xf8, 0xbf, 0x1e, 0xd7, 0x66,
+ 0x51, 0xc1, 0x47, 0x56, 0xa0, 0x61, 0xd6, 0x62,
+ 0xf5, 0x80, 0xff, 0x4d, 0xe4, 0x3b, 0x49, 0xfa,
+ 0x82, 0xd8, 0x0a, 0x4b, 0x80, 0xf8, 0x43, 0x4a,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0x11, 0x03, 0xe7, 0x84, 0x51, 0x50, 0x86, 0x35,
+ 0x71, 0x8a, 0x70, 0xe3, 0xc4, 0x26, 0x7b, 0x21,
+ 0x02, 0x13, 0xa0, 0x81, 0xe8, 0xe6, 0x14, 0x25,
+ 0x07, 0x34, 0xe5, 0xc5, 0x40, 0x06, 0xf2, 0x8b,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0x2f, 0x6f, 0x6d, 0x47, 0x48, 0x52, 0x11, 0xb9,
+ 0xe4, 0x3d, 0xc8, 0x71, 0xcf, 0xb2, 0xee, 0xae,
+ 0x5b, 0xf4, 0x12, 0x84, 0x5b, 0x1c, 0xec, 0x6c,
+ 0xc1, 0x66, 0x88, 0xaa, 0xc3, 0x40, 0xbd, 0x7e,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0xec, 0x02, 0xe8, 0x81, 0x4f, 0x84, 0x41, 0x69,
+ 0x06, 0xd8, 0xdc, 0x1d, 0x01, 0x78, 0xd7, 0xcb,
+ 0x39, 0xdf, 0xd3, 0x12, 0x1c, 0x99, 0xfd, 0xf3,
+ 0x5c, 0x83, 0xc9, 0xc2, 0x7a, 0x7b, 0x6a, 0x05,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0xff, 0x6f, 0xc3, 0x41, 0xc3, 0x5f, 0x34, 0x6d,
+ 0xa7, 0xdf, 0x3e, 0xc2, 0x8b, 0x29, 0xb6, 0xf1,
+ 0xf8, 0x67, 0xfd, 0xcd, 0xb1, 0x9f, 0x38, 0x08,
+ 0x1d, 0x8d, 0xd9, 0xc2, 0x43, 0x66, 0x18, 0x6c,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0xe4, 0xb1, 0x06, 0x17, 0xf8, 0x8b, 0x91, 0x95,
+ 0xe7, 0x57, 0x66, 0xac, 0x08, 0xb2, 0x03, 0x3e,
+ 0xf7, 0x84, 0x1f, 0xe3, 0x25, 0xa3, 0x11, 0xd2,
+ 0x11, 0xa4, 0x78, 0x74, 0x2a, 0x43, 0x20, 0xa5,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0xeb, 0x57, 0x5f, 0x20, 0xa3, 0x6b, 0xc7, 0xb4,
+ 0x66, 0x2a, 0xa0, 0x30, 0x3b, 0x52, 0x00, 0xc9,
+ 0xce, 0x6a, 0xd8, 0x1e, 0xbe, 0xed, 0xa1, 0xd1,
+ 0xbe, 0x63, 0xc7, 0xe1, 0xe2, 0x66, 0x67, 0x0c,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0xf0, 0x67, 0xad, 0x66, 0xbe, 0xec, 0x5a, 0xfd,
+ 0x29, 0xd2, 0x4f, 0x1d, 0xb2, 0x24, 0xb8, 0x90,
+ 0x05, 0x28, 0x0e, 0x66, 0x67, 0x74, 0x2d, 0xee,
+ 0x66, 0x25, 0x11, 0xd1, 0x76, 0xa2, 0xfc, 0x3a,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0x57, 0x56, 0x21, 0xb3, 0x2d, 0x2d, 0xe1, 0x9d,
+ 0xbf, 0x2c, 0x82, 0xa8, 0xad, 0x7e, 0x6c, 0x46,
+ 0xfb, 0x30, 0xeb, 0xce, 0xcf, 0xed, 0x2d, 0x65,
+ 0xe7, 0xe4, 0x96, 0x69, 0xe0, 0x48, 0xd2, 0xb6,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x7b, 0xba, 0x67, 0x15, 0xe5, 0x21, 0xc4, 0x69,
+ 0xd3, 0xef, 0x5c, 0x97, 0x9f, 0x5b, 0xba, 0x9c,
+ 0xfa, 0x55, 0x64, 0xec, 0xb5, 0x37, 0x53, 0x1b,
+ 0x3f, 0x4c, 0x0a, 0xed, 0x51, 0x98, 0x2b, 0x52,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0x44, 0xb6, 0x6b, 0x83, 0x09, 0x83, 0x55, 0x83,
+ 0xde, 0x1f, 0xcc, 0x33, 0xef, 0xdc, 0x05, 0xbb,
+ 0x3b, 0x63, 0x76, 0x45, 0xe4, 0x8e, 0x14, 0x7a,
+ 0x2d, 0xae, 0x90, 0xce, 0x68, 0xc3, 0xa4, 0xf2,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0x50, 0x3e, 0x99, 0x4e, 0x28, 0x2b, 0xc9, 0xf4,
+ 0xf5, 0xeb, 0x2b, 0x16, 0x04, 0x2d, 0xf5, 0xbe,
+ 0xc0, 0x91, 0x41, 0x2a, 0x8e, 0x69, 0x5e, 0x39,
+ 0x53, 0x2c, 0xc1, 0x18, 0xa5, 0xeb, 0xd8, 0xda,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x90, 0x0b, 0xa6, 0x92, 0x84, 0x30, 0xaf, 0xee,
+ 0x38, 0x59, 0x83, 0x83, 0xe9, 0xfe, 0xab, 0x86,
+ 0x79, 0x1b, 0xcd, 0xe7, 0x0a, 0x0f, 0x58, 0x53,
+ 0x36, 0xab, 0x12, 0xe1, 0x5c, 0x97, 0xc1, 0xfb,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x2b, 0x52, 0x1e, 0x54, 0xbe, 0x38, 0x4c, 0x3e,
+ 0x73, 0x37, 0x18, 0xf5, 0x25, 0x2c, 0xc8, 0xc7,
+ 0xda, 0x7e, 0xb6, 0x47, 0x9d, 0xf4, 0x46, 0xce,
+ 0xfa, 0x80, 0x20, 0x6b, 0xbd, 0xfd, 0x2a, 0xd8,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x45, 0xf0, 0xf5, 0x9b, 0xd9, 0x91, 0x26, 0xd5,
+ 0x91, 0x3b, 0xf8, 0x87, 0x8b, 0x34, 0x02, 0x31,
+ 0x64, 0xab, 0xf4, 0x1c, 0x6e, 0x34, 0x72, 0xdf,
+ 0x32, 0x6d, 0xe5, 0xd2, 0x67, 0x5e, 0x86, 0x93,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0xb3, 0xaf, 0x71, 0x64, 0xfa, 0xd4, 0xf1, 0x07,
+ 0x38, 0xef, 0x04, 0x8e, 0x89, 0xf4, 0x02, 0xd2,
+ 0xa5, 0xaf, 0x3b, 0xf5, 0x67, 0x56, 0xcf, 0xa9,
+ 0x8e, 0x43, 0xf5, 0xb5, 0xe3, 0x91, 0x8e, 0xe7,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0x51, 0xac, 0x0a, 0x65, 0xb7, 0x96, 0x20, 0xcf,
+ 0x88, 0xf6, 0x97, 0x35, 0x89, 0x0d, 0x31, 0x0f,
+ 0xbe, 0x17, 0xbe, 0x62, 0x03, 0x67, 0xc0, 0xee,
+ 0x4f, 0xc1, 0xe3, 0x7f, 0x6f, 0xab, 0xac, 0xb4,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0x7e, 0xea, 0xa8, 0xd7, 0xde, 0x20, 0x1b, 0x58,
+ 0x24, 0xd8, 0x26, 0x40, 0x36, 0x5f, 0x3f, 0xaa,
+ 0xe5, 0x5a, 0xea, 0x98, 0x58, 0xd4, 0xd6, 0xfc,
+ 0x20, 0x4c, 0x5c, 0x4f, 0xaf, 0x56, 0xc7, 0xc3,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0x61, 0xb1, 0xb1, 0x3e, 0x0e, 0x7e, 0x90, 0x3d,
+ 0x31, 0x54, 0xbd, 0xc9, 0x0d, 0x53, 0x62, 0xf1,
+ 0xcd, 0x18, 0x80, 0xf9, 0x91, 0x75, 0x41, 0xb3,
+ 0x51, 0x39, 0x57, 0xa7, 0xa8, 0x1e, 0xfb, 0xc9,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0xab, 0x29, 0xda, 0x10, 0xc4, 0x11, 0x2d, 0x5c,
+ 0xd1, 0xce, 0x1c, 0x95, 0xfa, 0xc6, 0xc7, 0xb0,
+ 0x1b, 0xd1, 0xdc, 0x6f, 0xa0, 0x9d, 0x1b, 0x23,
+ 0xfb, 0x6e, 0x90, 0x97, 0xd0, 0x75, 0x44, 0x7a,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0x02, 0x45, 0x95, 0xf4, 0x19, 0xb5, 0x93, 0x29,
+ 0x90, 0xf2, 0x63, 0x3f, 0x89, 0xe8, 0xa5, 0x31,
+ 0x76, 0xf2, 0x89, 0x79, 0x66, 0xd3, 0x96, 0xdf,
+ 0x33, 0xd1, 0xa6, 0x17, 0x73, 0xb1, 0xd0, 0x45,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0xd1, 0x8e, 0x22, 0xea, 0x44, 0x87, 0x6e, 0x9d,
+ 0xfb, 0x36, 0x02, 0x20, 0x63, 0xb7, 0x69, 0x45,
+ 0x25, 0x41, 0x69, 0xe0, 0x9b, 0x87, 0xcf, 0xa3,
+ 0x51, 0xbb, 0xfc, 0x8d, 0xf7, 0x29, 0xa7, 0xea,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0x11, 0x86, 0x7d, 0x84, 0xf9, 0x8c, 0x6e, 0xc4,
+ 0x64, 0x36, 0xc6, 0xf3, 0x42, 0x92, 0x31, 0x2b,
+ 0x1e, 0x12, 0xe6, 0x4d, 0xbe, 0xfa, 0x77, 0x3f,
+ 0x89, 0x41, 0x33, 0x58, 0x1c, 0x98, 0x16, 0x0a,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0xb2, 0xba, 0x0c, 0x8c, 0x9d, 0xbb, 0x1e, 0xb0,
+ 0x03, 0xb5, 0xdf, 0x4f, 0xf5, 0x35, 0xdb, 0xec,
+ 0x60, 0xf2, 0x5b, 0xb6, 0xd0, 0x49, 0xd3, 0xed,
+ 0x55, 0xc0, 0x7a, 0xd7, 0xaf, 0xa1, 0xea, 0x53,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[SHA3_256_DIGEST_SIZE] = {
+ 0x3b, 0x33, 0x67, 0xf8, 0xea, 0x92, 0x78, 0x62,
+ 0xdd, 0xbe, 0x72, 0x15, 0xbd, 0x6f, 0xfa, 0xe5,
+ 0x5e, 0xab, 0x9f, 0xb1, 0xe4, 0x23, 0x7c, 0x2c,
+ 0x80, 0xcf, 0x09, 0x75, 0xf8, 0xe2, 0xfa, 0x30,
+};
+
+/* SHAKE test vectors */
+
+static const u8 shake128_testvec_consolidated[SHA3_256_DIGEST_SIZE] = {
+ 0x89, 0x88, 0x3a, 0x44, 0xec, 0xfe, 0x3c, 0xeb,
+ 0x2f, 0x1c, 0x1d, 0xda, 0x9e, 0x36, 0x64, 0xf0,
+ 0x85, 0x4c, 0x49, 0x12, 0x76, 0x5a, 0x4d, 0xe7,
+ 0xa8, 0xfd, 0xcd, 0xbe, 0x45, 0xb4, 0x6f, 0xb0,
+};
+
+static const u8 shake256_testvec_consolidated[SHA3_256_DIGEST_SIZE] = {
+ 0x5a, 0xfd, 0x66, 0x62, 0x5c, 0x37, 0x2b, 0x41,
+ 0x77, 0x1c, 0x01, 0x5d, 0x64, 0x7c, 0x63, 0x7a,
+ 0x7c, 0x76, 0x9e, 0xa8, 0xd1, 0xb0, 0x8e, 0x02,
+ 0x16, 0x9b, 0xfe, 0x0e, 0xb5, 0xd8, 0x6a, 0xb5,
+};
diff --git a/lib/crypto/tests/sha384-testvecs.h b/lib/crypto/tests/sha384-testvecs.h
new file mode 100644
index 000000000000..6e2f572dd792
--- /dev/null
+++ b/lib/crypto/tests/sha384-testvecs.h
@@ -0,0 +1,290 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha384 */
+
+static const struct {
+ size_t data_len;
+ u8 digest[SHA384_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0x38, 0xb0, 0x60, 0xa7, 0x51, 0xac, 0x96, 0x38,
+ 0x4c, 0xd9, 0x32, 0x7e, 0xb1, 0xb1, 0xe3, 0x6a,
+ 0x21, 0xfd, 0xb7, 0x11, 0x14, 0xbe, 0x07, 0x43,
+ 0x4c, 0x0c, 0xc7, 0xbf, 0x63, 0xf6, 0xe1, 0xda,
+ 0x27, 0x4e, 0xde, 0xbf, 0xe7, 0x6f, 0x65, 0xfb,
+ 0xd5, 0x1a, 0xd2, 0xf1, 0x48, 0x98, 0xb9, 0x5b,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0x07, 0x34, 0x9d, 0x74, 0x48, 0x76, 0xa5, 0x72,
+ 0x78, 0x02, 0xb8, 0x6e, 0x21, 0x59, 0xb0, 0x75,
+ 0x09, 0x68, 0x11, 0x39, 0x53, 0x61, 0xee, 0x8d,
+ 0xf2, 0x01, 0xf3, 0x90, 0x53, 0x7c, 0xd3, 0xde,
+ 0x13, 0x9f, 0xd2, 0x74, 0x28, 0xfe, 0xe1, 0xc8,
+ 0x2e, 0x95, 0xc6, 0x7d, 0x69, 0x4d, 0x04, 0xc6,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0xc4, 0xef, 0x6e, 0x8c, 0x19, 0x1c, 0xaa, 0x0e,
+ 0x86, 0xf2, 0x68, 0xa1, 0xa0, 0x2d, 0x2e, 0xb2,
+ 0x84, 0xbc, 0x5d, 0x53, 0x31, 0xf8, 0x03, 0x75,
+ 0x56, 0xf4, 0x8b, 0x23, 0x1a, 0x68, 0x15, 0x9a,
+ 0x60, 0xb2, 0xec, 0x05, 0xe1, 0xd4, 0x5e, 0x9e,
+ 0xe8, 0x7c, 0x9d, 0xe4, 0x0f, 0x9c, 0x3a, 0xdd,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0x29, 0xd2, 0x02, 0xa2, 0x77, 0x24, 0xc7, 0xa7,
+ 0x23, 0x0c, 0x3e, 0x30, 0x56, 0x47, 0xdb, 0x75,
+ 0xd4, 0x41, 0xf8, 0xb3, 0x8e, 0x26, 0xf6, 0x92,
+ 0xbc, 0x20, 0x2e, 0x96, 0xcc, 0x81, 0x5f, 0x32,
+ 0x82, 0x60, 0xe2, 0xcf, 0x23, 0xd7, 0x3c, 0x90,
+ 0xb2, 0x56, 0x8f, 0xb6, 0x0f, 0xf0, 0x6b, 0x80,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0x21, 0x4c, 0xac, 0xfe, 0xbd, 0x40, 0x74, 0x1f,
+ 0xa2, 0x2d, 0x2f, 0x35, 0x91, 0xfd, 0xc9, 0x97,
+ 0x88, 0x12, 0x6c, 0x0c, 0x6e, 0xd8, 0x50, 0x0b,
+ 0x4b, 0x2c, 0x89, 0xa6, 0xa6, 0x4a, 0xad, 0xd7,
+ 0x72, 0x62, 0x2c, 0x62, 0x81, 0xcd, 0x24, 0x74,
+ 0xf5, 0x44, 0x05, 0xa0, 0x97, 0xea, 0xf1, 0x78,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0x06, 0x8b, 0x92, 0x9f, 0x8b, 0x64, 0xb2, 0x80,
+ 0xde, 0xcc, 0xde, 0xc3, 0x2f, 0x22, 0x27, 0xe8,
+ 0x3b, 0x6e, 0x16, 0x21, 0x14, 0x81, 0xbe, 0x5b,
+ 0xa7, 0xa7, 0x14, 0x8a, 0x00, 0x8f, 0x0d, 0x38,
+ 0x11, 0x63, 0xe8, 0x3e, 0xb9, 0xf1, 0xcf, 0x87,
+ 0xb1, 0x28, 0xe5, 0xa1, 0x89, 0xa8, 0x7a, 0xde,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0x9e, 0x37, 0x76, 0x62, 0x98, 0x39, 0xbe, 0xfd,
+ 0x2b, 0x91, 0x20, 0x54, 0x8f, 0x21, 0xe7, 0x30,
+ 0x0a, 0x01, 0x7a, 0x65, 0x0b, 0xc9, 0xb3, 0x89,
+ 0x3c, 0xb6, 0xd3, 0xa8, 0xff, 0xc9, 0x1b, 0x5c,
+ 0xd4, 0xac, 0xb4, 0x7e, 0xba, 0x94, 0xc3, 0x8a,
+ 0x26, 0x41, 0xf6, 0xd5, 0xed, 0x6f, 0x27, 0xa7,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x03, 0x1f, 0xef, 0x5a, 0x16, 0x28, 0x78, 0x10,
+ 0x29, 0xe8, 0xe2, 0xe4, 0x84, 0x36, 0x19, 0x10,
+ 0xaa, 0xea, 0xde, 0x06, 0x39, 0x5f, 0xb2, 0x36,
+ 0xca, 0x24, 0x4f, 0x7b, 0x66, 0xf7, 0xe7, 0x31,
+ 0xf3, 0x9b, 0x74, 0x1e, 0x17, 0x20, 0x88, 0x62,
+ 0x50, 0xeb, 0x5f, 0x9a, 0xa7, 0x2c, 0xf4, 0xc9,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0x10, 0xce, 0xed, 0x26, 0xb8, 0xac, 0xc1, 0x1b,
+ 0xe6, 0xb9, 0xeb, 0x7c, 0xae, 0xcd, 0x55, 0x5a,
+ 0x20, 0x2a, 0x7b, 0x43, 0xe6, 0x3e, 0xf0, 0x3f,
+ 0xd9, 0x2f, 0x8c, 0x52, 0xe2, 0xf0, 0xb6, 0x24,
+ 0x2e, 0xa4, 0xac, 0x24, 0x3a, 0x54, 0x99, 0x71,
+ 0x65, 0xab, 0x97, 0x2d, 0xb6, 0xe6, 0x94, 0x20,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x24, 0x6d, 0x9f, 0x59, 0x42, 0x36, 0xca, 0x34,
+ 0x36, 0x41, 0xa2, 0xcd, 0x69, 0xdf, 0x3d, 0xcb,
+ 0x64, 0x94, 0x54, 0xb2, 0xed, 0xc1, 0x1c, 0x31,
+ 0xe3, 0x26, 0xcb, 0x71, 0xe6, 0x98, 0xb2, 0x56,
+ 0x74, 0x30, 0xa9, 0x15, 0x98, 0x9d, 0xb3, 0x07,
+ 0xcc, 0xa8, 0xcc, 0x6f, 0x42, 0xb0, 0x9d, 0x2b,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0x85, 0x1f, 0xbc, 0x5e, 0x2a, 0x00, 0x7d, 0xc2,
+ 0x21, 0x4c, 0x28, 0x14, 0xc5, 0xd8, 0x0c, 0xe8,
+ 0x55, 0xa5, 0xa0, 0x77, 0xda, 0x8f, 0xce, 0xd4,
+ 0xf0, 0xcb, 0x30, 0xb8, 0x9c, 0x47, 0xe1, 0x33,
+ 0x92, 0x18, 0xc5, 0x1f, 0xf2, 0xef, 0xb5, 0xe5,
+ 0xbc, 0x63, 0xa6, 0xe5, 0x9a, 0xc9, 0xcc, 0xf1,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0x26, 0xd2, 0x4c, 0xb6, 0xce, 0xd8, 0x22, 0x2b,
+ 0x44, 0x10, 0x6f, 0x59, 0xf7, 0x0d, 0xb9, 0x3f,
+ 0x7d, 0x29, 0x75, 0xf1, 0x71, 0xb2, 0x71, 0x23,
+ 0xef, 0x68, 0xb7, 0x25, 0xae, 0xb8, 0x45, 0xf8,
+ 0xa3, 0xb2, 0x2d, 0x7a, 0x83, 0x0a, 0x05, 0x61,
+ 0xbc, 0x73, 0xf1, 0xf9, 0xba, 0xfb, 0x3d, 0xc2,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x7c, 0xe5, 0x7f, 0x5e, 0xea, 0xd9, 0x7e, 0x54,
+ 0x14, 0x30, 0x6f, 0x37, 0x02, 0x71, 0x0f, 0xf1,
+ 0x14, 0x16, 0xfa, 0xeb, 0x6e, 0x1e, 0xf0, 0xbe,
+ 0x10, 0xed, 0x01, 0xbf, 0xa0, 0x9d, 0xcb, 0x07,
+ 0x5f, 0x8b, 0x7f, 0x44, 0xe1, 0xd9, 0x13, 0xf0,
+ 0x29, 0xa2, 0x54, 0x32, 0xd9, 0xb0, 0x69, 0x69,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0xc5, 0x54, 0x1f, 0xcb, 0x9d, 0x8f, 0xdf, 0xbf,
+ 0xab, 0x55, 0x92, 0x1d, 0x3b, 0x93, 0x79, 0x26,
+ 0xdf, 0xba, 0x9a, 0x28, 0xff, 0xa0, 0x6c, 0xae,
+ 0x7b, 0x53, 0x8d, 0xfa, 0xef, 0x35, 0x88, 0x19,
+ 0x16, 0xb8, 0x72, 0x86, 0x76, 0x2a, 0xf5, 0xe6,
+ 0xec, 0xb2, 0xd7, 0xd4, 0xbe, 0x1a, 0xe4, 0x9f,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x74, 0x9d, 0x77, 0xfb, 0xe8, 0x0f, 0x0c, 0x2d,
+ 0x86, 0x0d, 0x49, 0xea, 0x2b, 0xd0, 0x13, 0xd1,
+ 0xe8, 0xb8, 0xe1, 0xa3, 0x7b, 0x48, 0xab, 0x6a,
+ 0x21, 0x2b, 0x4c, 0x48, 0x32, 0xb5, 0xdc, 0x31,
+ 0x7f, 0xd0, 0x32, 0x67, 0x9a, 0xc0, 0x85, 0x53,
+ 0xef, 0xe9, 0xfb, 0xe1, 0x8b, 0xd8, 0xcc, 0xc2,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0x7b, 0xa9, 0xde, 0xa3, 0x07, 0x5c, 0x4c, 0xaa,
+ 0x31, 0xc6, 0x9e, 0x55, 0xd4, 0x3f, 0x52, 0xdd,
+ 0xde, 0x36, 0x70, 0x96, 0x59, 0x6e, 0x90, 0x78,
+ 0x4c, 0x6a, 0x27, 0xde, 0x83, 0x84, 0xc3, 0x35,
+ 0x53, 0x76, 0x1d, 0xbf, 0x83, 0x64, 0xcf, 0xf2,
+ 0xb0, 0x3e, 0x07, 0x27, 0xe4, 0x25, 0x6c, 0x56,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0x53, 0x50, 0xf7, 0x3b, 0x86, 0x1d, 0x7a, 0xe2,
+ 0x5d, 0x9b, 0x71, 0xfa, 0x25, 0x23, 0x5a, 0xfe,
+ 0x8c, 0xb9, 0xac, 0x8a, 0x9d, 0x6c, 0x99, 0xbc,
+ 0x01, 0x9e, 0xa0, 0xd6, 0x3c, 0x03, 0x46, 0x21,
+ 0xb6, 0xd0, 0xb0, 0xb3, 0x23, 0x23, 0x58, 0xf1,
+ 0xea, 0x4e, 0xf2, 0x1a, 0x2f, 0x14, 0x2b, 0x5a,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0x06, 0x03, 0xb3, 0xba, 0x14, 0xe0, 0x28, 0x07,
+ 0xd5, 0x15, 0x97, 0x1f, 0x87, 0xef, 0x80, 0xba,
+ 0x48, 0x03, 0xb6, 0xc5, 0x47, 0xca, 0x8c, 0x95,
+ 0xed, 0x95, 0xfd, 0x27, 0xb6, 0x83, 0xda, 0x6d,
+ 0xa7, 0xb2, 0x1a, 0xd2, 0xb5, 0x89, 0xbb, 0xb4,
+ 0x00, 0xbc, 0x86, 0x54, 0x7d, 0x5a, 0x91, 0x63,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0xd3, 0xe0, 0x6e, 0x7d, 0x80, 0x08, 0x53, 0x07,
+ 0x8c, 0x0f, 0xc2, 0xce, 0x9f, 0x09, 0x86, 0x31,
+ 0x28, 0x24, 0x3c, 0x3e, 0x2d, 0x36, 0xb4, 0x28,
+ 0xc7, 0x1b, 0x70, 0xf9, 0x35, 0x9b, 0x10, 0xfa,
+ 0xc8, 0x5e, 0x2b, 0x32, 0x7f, 0x65, 0xd2, 0x68,
+ 0xb2, 0x84, 0x90, 0xf6, 0xc8, 0x6e, 0xb8, 0xdb,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0x39, 0xeb, 0xc4, 0xb3, 0x08, 0xe2, 0xdd, 0xf3,
+ 0x9f, 0x5e, 0x44, 0x93, 0x63, 0x8b, 0x39, 0x57,
+ 0xd7, 0xe8, 0x7e, 0x3d, 0x74, 0xf8, 0xf6, 0xab,
+ 0xfe, 0x74, 0x51, 0xe4, 0x1b, 0x4a, 0x23, 0xbc,
+ 0x69, 0xfc, 0xbb, 0xa7, 0x71, 0xa7, 0x86, 0x24,
+ 0xcc, 0x85, 0x70, 0xf2, 0x31, 0x0d, 0x47, 0xc0,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0x23, 0xc3, 0x97, 0x06, 0x79, 0xbe, 0x8a, 0xe9,
+ 0x1f, 0x1a, 0x43, 0xad, 0xe6, 0x76, 0x23, 0x13,
+ 0x64, 0xae, 0xda, 0xe7, 0x8b, 0x88, 0x96, 0xb6,
+ 0xa9, 0x1a, 0xb7, 0x80, 0x8e, 0x1c, 0x94, 0x98,
+ 0x09, 0x08, 0xdb, 0x8e, 0x4d, 0x0a, 0x09, 0x65,
+ 0xe5, 0x21, 0x1c, 0xd9, 0xab, 0x64, 0xbb, 0xea,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0x4f, 0x4a, 0x88, 0x9f, 0x40, 0x89, 0xfe, 0xb6,
+ 0xda, 0x9d, 0xcd, 0xa5, 0x27, 0xd2, 0x29, 0x71,
+ 0x58, 0x60, 0xd4, 0x55, 0xfe, 0x92, 0xcd, 0x51,
+ 0x8b, 0xec, 0x3b, 0xd3, 0xd1, 0x3e, 0x8d, 0x36,
+ 0x7b, 0xb1, 0x41, 0xef, 0xec, 0x9d, 0xdf, 0xcd,
+ 0x4e, 0xde, 0x5a, 0xe5, 0xe5, 0x16, 0x14, 0x54,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0xb5, 0xa5, 0x3e, 0x86, 0x39, 0x20, 0x49, 0x4c,
+ 0xcd, 0xb6, 0xdd, 0x03, 0xfe, 0x36, 0x6e, 0xa6,
+ 0xfc, 0xff, 0x19, 0x33, 0x0c, 0x52, 0xea, 0x37,
+ 0x94, 0xda, 0x5b, 0x27, 0xd1, 0x99, 0x5a, 0x89,
+ 0x40, 0x78, 0xfa, 0x96, 0xb9, 0x2f, 0xa0, 0x48,
+ 0xc9, 0xf8, 0x5c, 0xf0, 0x95, 0xf4, 0xea, 0x61,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0x6f, 0x48, 0x6f, 0x21, 0xb9, 0xc1, 0xcc, 0x92,
+ 0x4e, 0xed, 0x6b, 0xef, 0x51, 0x88, 0xdf, 0xfd,
+ 0xcb, 0x3d, 0x44, 0x9c, 0x37, 0x85, 0xb4, 0xc5,
+ 0xeb, 0x60, 0x55, 0x58, 0x01, 0x47, 0xbf, 0x75,
+ 0x9b, 0xa8, 0x82, 0x8c, 0xec, 0xe8, 0x0e, 0x58,
+ 0xc1, 0x26, 0xa2, 0x45, 0x87, 0x3e, 0xfb, 0x8d,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[SHA384_DIGEST_SIZE] = {
+ 0xfc, 0xcb, 0xe6, 0x42, 0xf0, 0x9e, 0x2b, 0x77,
+ 0x7b, 0x62, 0xe8, 0x70, 0x86, 0xbf, 0xaf, 0x3c,
+ 0xbb, 0x02, 0xd9, 0x86, 0xdc, 0xba, 0xd3, 0xa4,
+ 0x0d, 0x8d, 0xb3, 0x2d, 0x0b, 0xa3, 0x84, 0x04,
+ 0x7c, 0x16, 0x37, 0xaf, 0xba, 0x1e, 0xd4, 0x2f,
+ 0x4c, 0x57, 0x55, 0x86, 0x52, 0x47, 0x9a, 0xec,
+};
+
+static const u8 hmac_testvec_consolidated[SHA384_DIGEST_SIZE] = {
+ 0x82, 0xcf, 0x7d, 0x80, 0x71, 0xdb, 0x91, 0x09,
+ 0x67, 0xe8, 0x44, 0x4a, 0x0d, 0x03, 0xb1, 0xf9,
+ 0x62, 0xde, 0x4e, 0xbb, 0x1f, 0x41, 0xcd, 0x62,
+ 0x39, 0x6b, 0x2d, 0x44, 0x0e, 0xde, 0x98, 0x73,
+ 0xdd, 0xeb, 0x9d, 0x53, 0xfb, 0xee, 0xd1, 0xc3,
+ 0x96, 0xdb, 0xfc, 0x2a, 0x38, 0x90, 0x02, 0x53,
+};
diff --git a/lib/crypto/tests/sha384_kunit.c b/lib/crypto/tests/sha384_kunit.c
new file mode 100644
index 000000000000..e1ef5c995bb6
--- /dev/null
+++ b/lib/crypto/tests/sha384_kunit.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/sha2.h>
+#include "sha384-testvecs.h"
+
+#define HASH sha384
+#define HASH_CTX sha384_ctx
+#define HASH_SIZE SHA384_DIGEST_SIZE
+#define HASH_INIT sha384_init
+#define HASH_UPDATE sha384_update
+#define HASH_FINAL sha384_final
+#define HMAC_KEY hmac_sha384_key
+#define HMAC_CTX hmac_sha384_ctx
+#define HMAC_PREPAREKEY hmac_sha384_preparekey
+#define HMAC_INIT hmac_sha384_init
+#define HMAC_UPDATE hmac_sha384_update
+#define HMAC_FINAL hmac_sha384_final
+#define HMAC hmac_sha384
+#define HMAC_USINGRAWKEY hmac_sha384_usingrawkey
+#include "hash-test-template.h"
+
+static struct kunit_case hash_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite hash_test_suite = {
+ .name = "sha384",
+ .test_cases = hash_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(hash_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-384 and HMAC-SHA384");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/sha3_kunit.c b/lib/crypto/tests/sha3_kunit.c
new file mode 100644
index 000000000000..ed5fbe80337f
--- /dev/null
+++ b/lib/crypto/tests/sha3_kunit.c
@@ -0,0 +1,422 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+#include <crypto/sha3.h>
+#include "sha3-testvecs.h"
+
+#define HASH sha3_256
+#define HASH_CTX sha3_ctx
+#define HASH_SIZE SHA3_256_DIGEST_SIZE
+#define HASH_INIT sha3_256_init
+#define HASH_UPDATE sha3_update
+#define HASH_FINAL sha3_final
+#include "hash-test-template.h"
+
+/*
+ * Sample message and the outputs generated for various algorithms by passing
+ * it into "openssl sha3-224" etc.
+ */
+static const u8 test_sha3_sample[] =
+ "The quick red fox jumped over the lazy brown dog!\n"
+ "The quick red fox jumped over the lazy brown dog!\n"
+ "The quick red fox jumped over the lazy brown dog!\n"
+ "The quick red fox jumped over the lazy brown dog!\n";
+
+static const u8 test_sha3_224[8 + SHA3_224_DIGEST_SIZE + 8] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+ 0xd6, 0xe8, 0xd8, 0x80, 0xfa, 0x42, 0x80, 0x70,
+ 0x7e, 0x7f, 0xd7, 0xd2, 0xd7, 0x7a, 0x35, 0x65,
+ 0xf0, 0x0b, 0x4f, 0x9f, 0x2a, 0x33, 0xca, 0x0a,
+ 0xef, 0xa6, 0x4c, 0xb8,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static const u8 test_sha3_256[8 + SHA3_256_DIGEST_SIZE + 8] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+ 0xdb, 0x3b, 0xb0, 0xb8, 0x8d, 0x15, 0x78, 0xe5,
+ 0x78, 0x76, 0x8e, 0x39, 0x7e, 0x89, 0x86, 0xb9,
+ 0x14, 0x3a, 0x1e, 0xe7, 0x96, 0x7c, 0xf3, 0x25,
+ 0x70, 0xbd, 0xc3, 0xa9, 0xae, 0x63, 0x71, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static const u8 test_sha3_384[8 + SHA3_384_DIGEST_SIZE + 8] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+ 0x2d, 0x4b, 0x29, 0x85, 0x19, 0x94, 0xaa, 0x31,
+ 0x9b, 0x04, 0x9d, 0x6e, 0x79, 0x66, 0xc7, 0x56,
+ 0x8a, 0x2e, 0x99, 0x84, 0x06, 0xcf, 0x10, 0x2d,
+ 0xec, 0xf0, 0x03, 0x04, 0x1f, 0xd5, 0x99, 0x63,
+ 0x2f, 0xc3, 0x2b, 0x0d, 0xd9, 0x45, 0xf7, 0xbb,
+ 0x0a, 0xc3, 0x46, 0xab, 0xfe, 0x4d, 0x94, 0xc2,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static const u8 test_sha3_512[8 + SHA3_512_DIGEST_SIZE + 8] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+ 0xdd, 0x71, 0x3b, 0x44, 0xb6, 0x6c, 0xd7, 0x78,
+ 0xe7, 0x93, 0xa1, 0x4c, 0xd7, 0x24, 0x16, 0xf1,
+ 0xfd, 0xa2, 0x82, 0x4e, 0xed, 0x59, 0xe9, 0x83,
+ 0x15, 0x38, 0x89, 0x7d, 0x39, 0x17, 0x0c, 0xb2,
+ 0xcf, 0x12, 0x80, 0x78, 0xa1, 0x78, 0x41, 0xeb,
+ 0xed, 0x21, 0x4c, 0xa4, 0x4a, 0x5f, 0x30, 0x1a,
+ 0x70, 0x98, 0x4f, 0x14, 0xa2, 0xd1, 0x64, 0x1b,
+ 0xc2, 0x0a, 0xff, 0x3b, 0xe8, 0x26, 0x41, 0x8f,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static const u8 test_shake128[8 + SHAKE128_DEFAULT_SIZE + 8] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+ 0x41, 0xd6, 0xb8, 0x9c, 0xf8, 0xe8, 0x54, 0xf2,
+ 0x5c, 0xde, 0x51, 0x12, 0xaf, 0x9e, 0x0d, 0x91,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static const u8 test_shake256[8 + SHAKE256_DEFAULT_SIZE + 8] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+ 0xab, 0x06, 0xd4, 0xf9, 0x8b, 0xfd, 0xb2, 0xc4,
+ 0xfe, 0xf1, 0xcc, 0xe2, 0x40, 0x45, 0xdd, 0x15,
+ 0xcb, 0xdd, 0x02, 0x8d, 0xb7, 0x9f, 0x1e, 0x67,
+ 0xd6, 0x7f, 0x98, 0x5e, 0x1b, 0x19, 0xf8, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static void test_sha3_224_basic(struct kunit *test)
+{
+ u8 out[8 + SHA3_224_DIGEST_SIZE + 8];
+
+ BUILD_BUG_ON(sizeof(out) != sizeof(test_sha3_224));
+
+ memset(out, 0, sizeof(out));
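+	/* Hash the sample without its trailing NUL terminator. */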
+ sha3_224(test_sha3_sample, sizeof(test_sha3_sample) - 1, out + 8);
+
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_sha3_224, sizeof(test_sha3_224),
+ "SHA3-224 gives wrong output");
+}
+
+static void test_sha3_256_basic(struct kunit *test)
+{
+ u8 out[8 + SHA3_256_DIGEST_SIZE + 8];
+
+ BUILD_BUG_ON(sizeof(out) != sizeof(test_sha3_256));
+
+ memset(out, 0, sizeof(out));
+ sha3_256(test_sha3_sample, sizeof(test_sha3_sample) - 1, out + 8);
+
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_sha3_256, sizeof(test_sha3_256),
+ "SHA3-256 gives wrong output");
+}
+
+static void test_sha3_384_basic(struct kunit *test)
+{
+ u8 out[8 + SHA3_384_DIGEST_SIZE + 8];
+
+ BUILD_BUG_ON(sizeof(out) != sizeof(test_sha3_384));
+
+ memset(out, 0, sizeof(out));
+ sha3_384(test_sha3_sample, sizeof(test_sha3_sample) - 1, out + 8);
+
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_sha3_384, sizeof(test_sha3_384),
+ "SHA3-384 gives wrong output");
+}
+
+static void test_sha3_512_basic(struct kunit *test)
+{
+ u8 out[8 + SHA3_512_DIGEST_SIZE + 8];
+
+ BUILD_BUG_ON(sizeof(out) != sizeof(test_sha3_512));
+
+ memset(out, 0, sizeof(out));
+ sha3_512(test_sha3_sample, sizeof(test_sha3_sample) - 1, out + 8);
+
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_sha3_512, sizeof(test_sha3_512),
+ "SHA3-512 gives wrong output");
+}
+
+static void test_shake128_basic(struct kunit *test)
+{
+ u8 out[8 + SHAKE128_DEFAULT_SIZE + 8];
+
+ BUILD_BUG_ON(sizeof(out) != sizeof(test_shake128));
+
+ memset(out, 0, sizeof(out));
+ shake128(test_sha3_sample, sizeof(test_sha3_sample) - 1,
+ out + 8, sizeof(out) - 16);
+
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake128, sizeof(test_shake128),
+ "SHAKE128 gives wrong output");
+}
+
+static void test_shake256_basic(struct kunit *test)
+{
+ u8 out[8 + SHAKE256_DEFAULT_SIZE + 8];
+
+ BUILD_BUG_ON(sizeof(out) != sizeof(test_shake256));
+
+ memset(out, 0, sizeof(out));
+ shake256(test_sha3_sample, sizeof(test_sha3_sample) - 1,
+ out + 8, sizeof(out) - 16);
+
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake256, sizeof(test_shake256),
+ "SHAKE256 gives wrong output");
+}
+
+/*
+ * Tests using the NIST example values for SHAKE128 and SHAKE256.
+ *
+ * From: https://csrc.nist.gov/projects/cryptographic-standards-and-guidelines/example-values
+ */
+static const u8 test_nist_1600_sample[] = {
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3
+};
+
+static const u8 test_shake128_nist_0[] = {
+ 0x7f, 0x9c, 0x2b, 0xa4, 0xe8, 0x8f, 0x82, 0x7d,
+ 0x61, 0x60, 0x45, 0x50, 0x76, 0x05, 0x85, 0x3e
+};
+
+static const u8 test_shake128_nist_1600[] = {
+ 0x13, 0x1a, 0xb8, 0xd2, 0xb5, 0x94, 0x94, 0x6b,
+ 0x9c, 0x81, 0x33, 0x3f, 0x9b, 0xb6, 0xe0, 0xce,
+};
+
+static const u8 test_shake256_nist_0[] = {
+ 0x46, 0xb9, 0xdd, 0x2b, 0x0b, 0xa8, 0x8d, 0x13,
+ 0x23, 0x3b, 0x3f, 0xeb, 0x74, 0x3e, 0xeb, 0x24,
+ 0x3f, 0xcd, 0x52, 0xea, 0x62, 0xb8, 0x1b, 0x82,
+ 0xb5, 0x0c, 0x27, 0x64, 0x6e, 0xd5, 0x76, 0x2f
+};
+
+static const u8 test_shake256_nist_1600[] = {
+ 0xcd, 0x8a, 0x92, 0x0e, 0xd1, 0x41, 0xaa, 0x04,
+ 0x07, 0xa2, 0x2d, 0x59, 0x28, 0x86, 0x52, 0xe9,
+ 0xd9, 0xf1, 0xa7, 0xee, 0x0c, 0x1e, 0x7c, 0x1c,
+ 0xa6, 0x99, 0x42, 0x4d, 0xa8, 0x4a, 0x90, 0x4d,
+};
+
+static void test_shake128_nist(struct kunit *test)
+{
+ u8 out[SHAKE128_DEFAULT_SIZE];
+
+ shake128("", 0, out, sizeof(out));
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake128_nist_0, sizeof(out),
+ "SHAKE128 gives wrong output for NIST.0");
+
+ shake128(test_nist_1600_sample, sizeof(test_nist_1600_sample),
+ out, sizeof(out));
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake128_nist_1600, sizeof(out),
+ "SHAKE128 gives wrong output for NIST.1600");
+}
+
+static void test_shake256_nist(struct kunit *test)
+{
+ u8 out[SHAKE256_DEFAULT_SIZE];
+
+ shake256("", 0, out, sizeof(out));
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake256_nist_0, sizeof(out),
+ "SHAKE256 gives wrong output for NIST.0");
+
+ shake256(test_nist_1600_sample, sizeof(test_nist_1600_sample),
+ out, sizeof(out));
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake256_nist_1600, sizeof(out),
+ "SHAKE256 gives wrong output for NIST.1600");
+}
+
+static void shake(int alg, const u8 *in, size_t in_len, u8 *out, size_t out_len)
+{
+ if (alg == 0)
+ shake128(in, in_len, out, out_len);
+ else
+ shake256(in, in_len, out, out_len);
+}
+
+static void shake_init(struct shake_ctx *ctx, int alg)
+{
+ if (alg == 0)
+ shake128_init(ctx);
+ else
+ shake256_init(ctx);
+}
+
+/*
+ * Test each of SHAKE128 and SHAKE256 with all input lengths 0 through 4096,
+ * for both input and output. The input and output lengths cycle through the
+ * values together, giving 4097 tests per algorithm. To verify all the SHAKE
+ * outputs, compute and verify the SHA3-256 digest of all of them concatenated
+ * together.
+ */
+static void test_shake_all_lens_up_to_4096(struct kunit *test)
+{
+ struct sha3_ctx main_ctx;
+ const size_t max_len = 4096;
+ u8 *const in = test_buf;
+ u8 *const out = &test_buf[TEST_BUF_LEN - max_len];
+ u8 main_hash[SHA3_256_DIGEST_SIZE];
+
+ KUNIT_ASSERT_LE(test, 2 * max_len, TEST_BUF_LEN);
+
+ rand_bytes_seeded_from_len(in, max_len);
+ for (int alg = 0; alg < 2; alg++) {
+ sha3_256_init(&main_ctx);
+ for (size_t in_len = 0; in_len <= max_len; in_len++) {
+ size_t out_len = (in_len * 293) % (max_len + 1);
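+			/*
+			 * 293 is coprime to 4097, so out_len also takes each
+			 * value in 0..4096 exactly once.
+			 */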
+
+ shake(alg, in, in_len, out, out_len);
+ sha3_update(&main_ctx, out, out_len);
+ }
+ sha3_final(&main_ctx, main_hash);
+ if (alg == 0)
+ KUNIT_ASSERT_MEMEQ_MSG(test, main_hash,
+ shake128_testvec_consolidated,
+ sizeof(main_hash),
+ "shake128() gives wrong output");
+ else
+ KUNIT_ASSERT_MEMEQ_MSG(test, main_hash,
+ shake256_testvec_consolidated,
+ sizeof(main_hash),
+ "shake256() gives wrong output");
+ }
+}
+
+/*
+ * Test that a sequence of SHAKE squeezes gives the same output as a single
+ * squeeze of the same total length.
+ */
+static void test_shake_multiple_squeezes(struct kunit *test)
+{
+ const size_t max_len = 512;
+ u8 *ref_out;
+
+ KUNIT_ASSERT_GE(test, TEST_BUF_LEN, 2 * max_len);
+
+ ref_out = kunit_kzalloc(test, max_len, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, ref_out);
+
+ for (int i = 0; i < 2000; i++) {
+ const int alg = rand32() % 2;
+ const size_t in_len = rand_length(max_len);
+ const size_t out_len = rand_length(max_len);
+ const size_t in_offs = rand_offset(max_len - in_len);
+ const size_t out_offs = rand_offset(max_len - out_len);
+ u8 *const in = &test_buf[in_offs];
+ u8 *const out = &test_buf[out_offs];
+ struct shake_ctx ctx;
+ size_t remaining_len, j, num_parts;
+
+ rand_bytes(in, in_len);
+ rand_bytes(out, out_len);
+
+ /* Compute the output using the one-shot function. */
+ shake(alg, in, in_len, ref_out, out_len);
+
+ /* Compute the output using a random sequence of squeezes. */
+ shake_init(&ctx, alg);
+ shake_update(&ctx, in, in_len);
+ remaining_len = out_len;
+ j = 0;
+ num_parts = 0;
+ while (rand_bool()) {
+ size_t part_len = rand_length(remaining_len);
+
+ shake_squeeze(&ctx, &out[j], part_len);
+ num_parts++;
+ j += part_len;
+ remaining_len -= part_len;
+ }
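+		/*
+		 * Squeeze out any remaining bytes; occasionally also exercise
+		 * a final zero-length squeeze.
+		 */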
+ if (remaining_len != 0 || rand_bool()) {
+ shake_squeeze(&ctx, &out[j], remaining_len);
+ num_parts++;
+ }
+
+ /* Verify that the outputs are the same. */
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, out, ref_out, out_len,
+ "Multi-squeeze test failed with in_len=%zu in_offs=%zu out_len=%zu out_offs=%zu num_parts=%zu alg=%d",
+ in_len, in_offs, out_len, out_offs, num_parts, alg);
+ }
+}
+
+/*
+ * Test that SHAKE operations on buffers immediately followed by an unmapped
+ * page work as expected. This catches out-of-bounds memory accesses even if
+ * they occur in assembly code.
+ */
+static void test_shake_with_guarded_bufs(struct kunit *test)
+{
+ const size_t max_len = 512;
+ u8 *reg_buf;
+
+ KUNIT_ASSERT_GE(test, TEST_BUF_LEN, max_len);
+
+ reg_buf = kunit_kzalloc(test, max_len, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, reg_buf);
+
+ for (int alg = 0; alg < 2; alg++) {
+ for (size_t len = 0; len <= max_len; len++) {
+ u8 *guarded_buf = &test_buf[TEST_BUF_LEN - len];
+
+ rand_bytes(reg_buf, len);
+ memcpy(guarded_buf, reg_buf, len);
+
+ shake(alg, reg_buf, len, reg_buf, len);
+ shake(alg, guarded_buf, len, guarded_buf, len);
+
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, reg_buf, guarded_buf, len,
+ "Guard page test failed with len=%zu alg=%d",
+ len, alg);
+ }
+ }
+}
+
+static struct kunit_case sha3_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(test_sha3_224_basic),
+ KUNIT_CASE(test_sha3_256_basic),
+ KUNIT_CASE(test_sha3_384_basic),
+ KUNIT_CASE(test_sha3_512_basic),
+ KUNIT_CASE(test_shake128_basic),
+ KUNIT_CASE(test_shake256_basic),
+ KUNIT_CASE(test_shake128_nist),
+ KUNIT_CASE(test_shake256_nist),
+ KUNIT_CASE(test_shake_all_lens_up_to_4096),
+ KUNIT_CASE(test_shake_multiple_squeezes),
+ KUNIT_CASE(test_shake_with_guarded_bufs),
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite sha3_test_suite = {
+ .name = "sha3",
+ .test_cases = sha3_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(sha3_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for SHA3");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/sha512-testvecs.h b/lib/crypto/tests/sha512-testvecs.h
new file mode 100644
index 000000000000..c506aaf72ba7
--- /dev/null
+++ b/lib/crypto/tests/sha512-testvecs.h
@@ -0,0 +1,342 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha512 */
+
+static const struct {
+ size_t data_len;
+ u8 digest[SHA512_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0xcf, 0x83, 0xe1, 0x35, 0x7e, 0xef, 0xb8, 0xbd,
+ 0xf1, 0x54, 0x28, 0x50, 0xd6, 0x6d, 0x80, 0x07,
+ 0xd6, 0x20, 0xe4, 0x05, 0x0b, 0x57, 0x15, 0xdc,
+ 0x83, 0xf4, 0xa9, 0x21, 0xd3, 0x6c, 0xe9, 0xce,
+ 0x47, 0xd0, 0xd1, 0x3c, 0x5d, 0x85, 0xf2, 0xb0,
+ 0xff, 0x83, 0x18, 0xd2, 0x87, 0x7e, 0xec, 0x2f,
+ 0x63, 0xb9, 0x31, 0xbd, 0x47, 0x41, 0x7a, 0x81,
+ 0xa5, 0x38, 0x32, 0x7a, 0xf9, 0x27, 0xda, 0x3e,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0x12, 0xf2, 0xb6, 0xec, 0x84, 0xa0, 0x8e, 0xcf,
+ 0x0d, 0xec, 0x17, 0xfd, 0x1f, 0x91, 0x15, 0x69,
+ 0xd2, 0xb9, 0x89, 0xff, 0x89, 0x9d, 0xd9, 0x0b,
+ 0x7a, 0x0f, 0x82, 0x94, 0x57, 0x5b, 0xf3, 0x08,
+ 0x42, 0x45, 0x23, 0x08, 0x44, 0x54, 0x35, 0x36,
+ 0xed, 0x4b, 0xb3, 0xa5, 0xbf, 0x17, 0xc1, 0x3c,
+ 0xdd, 0x25, 0x4a, 0x30, 0x64, 0xed, 0x66, 0x06,
+ 0x72, 0x05, 0xc2, 0x71, 0x5a, 0x6c, 0xd0, 0x75,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0x01, 0x37, 0x97, 0xcc, 0x0a, 0xcb, 0x61, 0xa4,
+ 0x93, 0x26, 0x36, 0x4b, 0xd2, 0x27, 0xea, 0xaf,
+ 0xda, 0xfa, 0x8f, 0x86, 0x12, 0x99, 0x7b, 0xc8,
+ 0x94, 0xa9, 0x1c, 0x70, 0x3f, 0x43, 0xf3, 0x9a,
+ 0x02, 0xc5, 0x0d, 0x8e, 0x01, 0xf8, 0x3a, 0xa6,
+ 0xec, 0xaa, 0xc5, 0xc7, 0x9d, 0x3d, 0x7f, 0x9d,
+ 0x47, 0x0e, 0x58, 0x2d, 0x9a, 0x2d, 0x51, 0x1d,
+ 0xc3, 0x77, 0xb2, 0x7f, 0x69, 0x9a, 0xc3, 0x50,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0xd4, 0xa3, 0xc2, 0x50, 0xa0, 0x33, 0xc6, 0xe4,
+ 0x50, 0x08, 0xea, 0xc9, 0xb8, 0x35, 0x55, 0x34,
+ 0x61, 0xb8, 0x2e, 0xa2, 0xe5, 0xdc, 0x04, 0x70,
+ 0x99, 0x86, 0x5f, 0xee, 0x2e, 0x1e, 0xd4, 0x40,
+ 0xf8, 0x88, 0x01, 0x84, 0x97, 0x16, 0x6a, 0xd3,
+ 0x0a, 0xb4, 0xe2, 0x34, 0xca, 0x1f, 0xfc, 0x6b,
+ 0x61, 0xf3, 0x7f, 0x72, 0xfa, 0x8d, 0x22, 0xcf,
+ 0x7f, 0x2d, 0x87, 0x9d, 0xbb, 0x59, 0xac, 0x53,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0x3b, 0xae, 0x66, 0x48, 0x0f, 0x35, 0x3c, 0xcd,
+ 0x92, 0xa9, 0xb6, 0xb9, 0xfe, 0x19, 0x90, 0x92,
+ 0x52, 0xa5, 0x02, 0xd1, 0x89, 0xcf, 0x98, 0x86,
+ 0x29, 0x28, 0xab, 0xc4, 0x9e, 0xcc, 0x75, 0x38,
+ 0x95, 0xa7, 0x59, 0x43, 0xef, 0x8c, 0x3a, 0xeb,
+ 0x40, 0xa3, 0xbe, 0x2b, 0x75, 0x0d, 0xfd, 0xc3,
+ 0xaf, 0x69, 0x08, 0xad, 0x9f, 0xc9, 0xf4, 0x96,
+ 0xa9, 0xc2, 0x2b, 0x1b, 0x66, 0x6f, 0x1d, 0x28,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0x9c, 0xfb, 0x3c, 0x40, 0xd5, 0x3b, 0xc4, 0xff,
+ 0x07, 0xa7, 0xf0, 0x24, 0xb7, 0xd6, 0x5e, 0x12,
+ 0x5b, 0x85, 0xb5, 0xa5, 0xe0, 0x82, 0xa6, 0xda,
+ 0x30, 0x13, 0x2f, 0x1a, 0xe3, 0xd0, 0x55, 0xcb,
+ 0x14, 0x19, 0xe2, 0x09, 0x91, 0x96, 0x26, 0xf9,
+ 0x38, 0xd7, 0xfa, 0x4a, 0xfb, 0x2f, 0x6f, 0xc0,
+ 0xf4, 0x95, 0xc3, 0x40, 0xf6, 0xdb, 0xe7, 0xc2,
+ 0x79, 0x23, 0xa4, 0x20, 0x96, 0x3a, 0x00, 0xbb,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0x92, 0x1a, 0x21, 0x06, 0x6e, 0x08, 0x84, 0x09,
+ 0x23, 0x8d, 0x63, 0xec, 0xd6, 0x72, 0xd3, 0x21,
+ 0x51, 0xe8, 0x65, 0x94, 0xf8, 0x1f, 0x5f, 0xa7,
+ 0xab, 0x6b, 0xae, 0x1c, 0x2c, 0xaf, 0xf9, 0x0c,
+ 0x7c, 0x5a, 0x74, 0x1d, 0x90, 0x26, 0x4a, 0xc3,
+ 0xa1, 0x60, 0xf4, 0x1d, 0xd5, 0x3c, 0x86, 0xe8,
+ 0x00, 0xb3, 0x99, 0x27, 0xb8, 0x9d, 0x3e, 0x17,
+ 0x32, 0x5a, 0x34, 0x3e, 0xc2, 0xb2, 0x6e, 0xbd,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x5a, 0x1f, 0x40, 0x5f, 0xee, 0xf2, 0xdd, 0x67,
+ 0x01, 0xcb, 0x26, 0x58, 0xf5, 0x1b, 0xe8, 0x7e,
+ 0xeb, 0x7d, 0x9d, 0xef, 0xd3, 0x55, 0xd6, 0x89,
+ 0x2e, 0xfc, 0x14, 0xe2, 0x98, 0x4c, 0x31, 0xaa,
+ 0x69, 0x00, 0xf9, 0x4e, 0xb0, 0x75, 0x1b, 0x71,
+ 0x93, 0x60, 0xdf, 0xa1, 0xaf, 0xba, 0xc2, 0xd3,
+ 0x6a, 0x22, 0xa0, 0xff, 0xb5, 0x66, 0x15, 0x66,
+ 0xd2, 0x24, 0x9a, 0x7e, 0xe4, 0xe5, 0x84, 0xdb,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0x32, 0xbd, 0xcf, 0x72, 0xa9, 0x74, 0x87, 0xe6,
+ 0x2a, 0x53, 0x7e, 0x6d, 0xac, 0xc2, 0xdd, 0x2c,
+ 0x87, 0xb3, 0xf7, 0x90, 0x29, 0xc9, 0x16, 0x59,
+ 0xd2, 0x7e, 0x6e, 0x84, 0x1d, 0xa6, 0xaf, 0x3c,
+ 0xca, 0xd6, 0x1a, 0x24, 0xa4, 0xcd, 0x59, 0x44,
+ 0x20, 0xd7, 0xd2, 0x5b, 0x97, 0xda, 0xd5, 0xa9,
+ 0x23, 0xb1, 0xa4, 0x60, 0xb8, 0x05, 0x98, 0xdc,
+ 0xef, 0x89, 0x81, 0xe3, 0x3a, 0xf9, 0x24, 0x37,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x96, 0x3a, 0x1a, 0xdd, 0x1b, 0xeb, 0x1a, 0x55,
+ 0x24, 0x52, 0x3d, 0xec, 0x9d, 0x52, 0x2e, 0xa6,
+ 0xfe, 0x81, 0xd6, 0x98, 0xac, 0xcc, 0x60, 0x56,
+ 0x04, 0x9d, 0xa3, 0xf3, 0x56, 0x05, 0xe4, 0x8a,
+ 0x61, 0xaf, 0x6f, 0x6e, 0x8e, 0x75, 0x67, 0x3a,
+ 0xd2, 0xb0, 0x85, 0x2d, 0x17, 0xd2, 0x86, 0x8c,
+ 0x50, 0x4b, 0xdd, 0xef, 0x35, 0x00, 0xde, 0x29,
+ 0x3d, 0x4b, 0x04, 0x12, 0x8a, 0x81, 0xe2, 0xcc,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0x9c, 0x6e, 0xf0, 0x6f, 0x71, 0x77, 0xd5, 0xd0,
+ 0xbb, 0x70, 0x1f, 0xcb, 0xbd, 0xd3, 0xfe, 0x23,
+ 0x71, 0x78, 0xad, 0x3a, 0xd2, 0x1e, 0x34, 0xf4,
+ 0x6d, 0xb4, 0xa2, 0x0a, 0x24, 0xcb, 0xe1, 0x99,
+ 0x07, 0xd0, 0x79, 0x8f, 0x7e, 0x69, 0x31, 0x68,
+ 0x29, 0xb5, 0x85, 0x82, 0x67, 0xdc, 0x4a, 0x8d,
+ 0x44, 0x04, 0x02, 0xc0, 0xfb, 0xd2, 0x19, 0x66,
+ 0x1e, 0x25, 0x8b, 0xd2, 0x5a, 0x59, 0x68, 0xc0,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0xb8, 0x8f, 0xa8, 0x29, 0x4d, 0xcf, 0x5f, 0x73,
+ 0x3c, 0x55, 0x43, 0xd9, 0x1c, 0xbc, 0x0c, 0x17,
+ 0x75, 0x0b, 0xc7, 0xb1, 0x1d, 0x9f, 0x7b, 0x2f,
+ 0x4c, 0x3d, 0x2a, 0x71, 0xfb, 0x1b, 0x0e, 0xca,
+ 0x4e, 0x96, 0xa0, 0x95, 0xee, 0xf4, 0x93, 0x76,
+ 0x36, 0xfb, 0x5d, 0xd3, 0x46, 0xc4, 0x1d, 0x41,
+ 0x32, 0x92, 0x9d, 0xed, 0xdb, 0x7f, 0xfa, 0xb3,
+ 0x91, 0x61, 0x3e, 0xd6, 0xb2, 0xca, 0x8d, 0x81,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x54, 0xac, 0x1a, 0xa1, 0xa6, 0xa3, 0x47, 0x2a,
+ 0x5a, 0xac, 0x1a, 0x3a, 0x4b, 0xa1, 0x11, 0x08,
+ 0xa7, 0x90, 0xec, 0x52, 0xcb, 0xaf, 0x68, 0x41,
+ 0x38, 0x44, 0x53, 0x94, 0x93, 0x30, 0xaf, 0x3a,
+ 0xec, 0x99, 0x3a, 0x7d, 0x2a, 0xd5, 0xb6, 0x05,
+ 0xf5, 0xa6, 0xbb, 0x9b, 0x82, 0xc2, 0xbd, 0x98,
+ 0x28, 0x62, 0x98, 0x3e, 0xe4, 0x27, 0x9b, 0xaa,
+ 0xce, 0x0a, 0x6f, 0xab, 0x1b, 0x16, 0xf3, 0xdd,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x04, 0x37, 0x60, 0xbc, 0xb3, 0xb1, 0xc6, 0x2d,
+ 0x42, 0xc5, 0xd7, 0x7e, 0xd9, 0x86, 0x82, 0xe0,
+ 0xf4, 0x62, 0xad, 0x75, 0x68, 0x0b, 0xc7, 0xa8,
+ 0xd6, 0x9a, 0x76, 0xe5, 0x29, 0xb8, 0x37, 0x30,
+ 0x0f, 0xc0, 0xbc, 0x81, 0x94, 0x7c, 0x13, 0xf4,
+ 0x9c, 0x27, 0xbc, 0x59, 0xa1, 0x70, 0x6a, 0x87,
+ 0x20, 0x12, 0x0a, 0x2a, 0x62, 0x5e, 0x6f, 0xca,
+ 0x91, 0x6b, 0x34, 0x7e, 0x4c, 0x0d, 0xf0, 0x6c,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x4b, 0x7c, 0x1f, 0x53, 0x52, 0xcc, 0x30, 0xed,
+ 0x91, 0x44, 0x6f, 0x0d, 0xb5, 0x41, 0x79, 0x99,
+ 0xaf, 0x82, 0x65, 0x52, 0x03, 0xf8, 0x55, 0x74,
+ 0x7c, 0xd9, 0x41, 0xd6, 0xe8, 0x91, 0xa4, 0x85,
+ 0xcb, 0x0a, 0x60, 0x08, 0x76, 0x07, 0x60, 0x99,
+ 0x89, 0x76, 0xba, 0x84, 0xbd, 0x0b, 0xf2, 0xb3,
+ 0xdc, 0xf3, 0x33, 0xd1, 0x9b, 0x0b, 0x2e, 0x5d,
+ 0xf6, 0x9d, 0x0f, 0x67, 0xf4, 0x86, 0xb3, 0xd5,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0x7d, 0x83, 0x78, 0x6a, 0x5d, 0x52, 0x42, 0x2a,
+ 0xb1, 0x97, 0xc6, 0x62, 0xa2, 0x2a, 0x7c, 0x8b,
+ 0xcd, 0x4f, 0xa4, 0x86, 0x19, 0xa4, 0x5b, 0x1d,
+ 0xc7, 0x6f, 0x2f, 0x9c, 0x03, 0xc3, 0x45, 0x2e,
+ 0xa7, 0x8e, 0x38, 0xf2, 0x57, 0x55, 0x89, 0x47,
+ 0xed, 0xeb, 0x81, 0xe2, 0xe0, 0x55, 0x9f, 0xe6,
+ 0xca, 0x03, 0x59, 0xd3, 0xd4, 0xba, 0xc9, 0x2d,
+ 0xaf, 0xbb, 0x62, 0x2e, 0xe6, 0x89, 0xe4, 0x11,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0xe9, 0x14, 0xe7, 0x01, 0xd0, 0x81, 0x09, 0x51,
+ 0x78, 0x1c, 0x8e, 0x6c, 0x00, 0xd3, 0x28, 0xa0,
+ 0x2a, 0x7b, 0xd6, 0x25, 0xca, 0xd0, 0xf9, 0xb8,
+ 0xd8, 0xcf, 0xd0, 0xb7, 0x48, 0x25, 0xb7, 0x6a,
+ 0x53, 0x8e, 0xf8, 0x52, 0x9c, 0x1f, 0x7d, 0xae,
+ 0x4c, 0x22, 0xd5, 0x9d, 0xf0, 0xaf, 0x98, 0x91,
+ 0x19, 0x1f, 0x99, 0xbd, 0xa6, 0xc2, 0x0f, 0x05,
+ 0xa5, 0x9f, 0x3e, 0x87, 0xed, 0xc3, 0xab, 0x92,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0x2e, 0xf4, 0x72, 0xd2, 0xd9, 0x4a, 0xd5, 0xf9,
+ 0x20, 0x03, 0x4a, 0xad, 0xed, 0xa9, 0x1b, 0x64,
+ 0x73, 0x38, 0xc6, 0x30, 0xa8, 0x7f, 0xf9, 0x3b,
+ 0x8c, 0xbc, 0xa1, 0x2d, 0x22, 0x7b, 0x84, 0x37,
+ 0xf5, 0xba, 0xee, 0xf0, 0x80, 0x9d, 0xe3, 0x82,
+ 0xbd, 0x07, 0x68, 0x15, 0x01, 0x22, 0xf6, 0x88,
+ 0x07, 0x0b, 0xfd, 0xb7, 0xb1, 0xc0, 0x68, 0x4b,
+ 0x8d, 0x05, 0xec, 0xfb, 0xcd, 0xde, 0xa4, 0x2a,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0x73, 0xe3, 0xe5, 0x87, 0x01, 0x0a, 0x29, 0x4d,
+ 0xad, 0x92, 0x67, 0x64, 0xc7, 0x71, 0x0b, 0x22,
+ 0x80, 0x8a, 0x6e, 0x8b, 0x20, 0x73, 0xb2, 0xd7,
+ 0x98, 0x20, 0x35, 0x42, 0x42, 0x5d, 0x85, 0x12,
+ 0xb0, 0x06, 0x69, 0x63, 0x5f, 0x5b, 0xe7, 0x63,
+ 0x6f, 0xe6, 0x18, 0xa6, 0xc1, 0xa6, 0xae, 0x27,
+ 0xa7, 0x6a, 0x73, 0x6b, 0x27, 0xd5, 0x47, 0xe1,
+ 0xa2, 0x7d, 0xe4, 0x0d, 0xbd, 0x23, 0x7b, 0x7a,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0x11, 0x5b, 0x77, 0x36, 0x6b, 0x3b, 0xe4, 0x42,
+ 0xe4, 0x92, 0x23, 0xcb, 0x0c, 0x06, 0xff, 0xb7,
+ 0x0c, 0x71, 0x64, 0xd9, 0x8a, 0x57, 0x75, 0x7b,
+ 0xa2, 0xd2, 0x17, 0x19, 0xbb, 0xb5, 0x3c, 0xb3,
+ 0x5f, 0xae, 0x35, 0x75, 0x8e, 0xa8, 0x97, 0x43,
+ 0xce, 0xfe, 0x41, 0x84, 0xfe, 0xcb, 0x18, 0x70,
+ 0x68, 0x2e, 0x16, 0x19, 0xd5, 0x10, 0x0d, 0x2f,
+ 0x61, 0x87, 0x79, 0xee, 0x5f, 0x24, 0xdd, 0x76,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0x9e, 0x96, 0xe1, 0x0a, 0xb2, 0xd5, 0xba, 0xcf,
+ 0x27, 0xba, 0x6f, 0x85, 0xe9, 0xbf, 0x96, 0xb9,
+ 0x5a, 0x00, 0x00, 0x06, 0xdc, 0xb7, 0xaf, 0x0a,
+ 0x8f, 0x1d, 0x31, 0xf6, 0xce, 0xc3, 0x50, 0x2e,
+ 0x61, 0x3a, 0x8b, 0x28, 0xaf, 0xb2, 0x50, 0x0d,
+ 0x00, 0x98, 0x02, 0x11, 0x6b, 0xfa, 0x51, 0xc1,
+ 0xde, 0xe1, 0x34, 0x9f, 0xda, 0x11, 0x63, 0xfa,
+ 0x0a, 0xa0, 0xa2, 0x67, 0x39, 0xeb, 0x9b, 0xf1,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0x46, 0x4e, 0x81, 0xd1, 0x08, 0x2a, 0x46, 0x12,
+ 0x4e, 0xae, 0x1f, 0x5d, 0x57, 0xe5, 0x19, 0xbc,
+ 0x76, 0x38, 0xb6, 0xa7, 0xe3, 0x72, 0x6d, 0xaf,
+ 0x80, 0x3b, 0xd0, 0xbc, 0x06, 0xe8, 0xab, 0xab,
+ 0x86, 0x4b, 0x0b, 0x7a, 0x61, 0xa6, 0x13, 0xff,
+ 0x64, 0x47, 0x89, 0xb7, 0x63, 0x8a, 0xa5, 0x4c,
+ 0x9f, 0x52, 0x70, 0xeb, 0x21, 0xe5, 0x2d, 0xe9,
+ 0xe7, 0xab, 0x1c, 0x0e, 0x74, 0xf5, 0x72, 0xec,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0xfa, 0x6e, 0xff, 0x3c, 0xc1, 0x98, 0x49, 0x42,
+ 0x34, 0x67, 0xd4, 0xd3, 0xfa, 0xae, 0x27, 0xe4,
+ 0x77, 0x11, 0x84, 0xd2, 0x57, 0x99, 0xf8, 0xfd,
+ 0x41, 0x50, 0x84, 0x80, 0x7f, 0xf7, 0xb2, 0xd3,
+ 0x88, 0x21, 0x9c, 0xe8, 0xb9, 0x05, 0xd3, 0x48,
+ 0x64, 0xc5, 0xb7, 0x29, 0xd9, 0x21, 0x17, 0xad,
+ 0x89, 0x9c, 0x79, 0x55, 0x51, 0x0b, 0x96, 0x3e,
+ 0x10, 0x40, 0xe1, 0xdd, 0x7b, 0x39, 0x40, 0x86,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0x41, 0xb3, 0xd2, 0x93, 0xcd, 0x79, 0x84, 0xc2,
+ 0xf5, 0xea, 0xf3, 0xb3, 0x94, 0x23, 0xaa, 0x76,
+ 0x87, 0x5f, 0xe3, 0xd2, 0x03, 0xd8, 0x00, 0xbb,
+ 0xa1, 0x55, 0xe4, 0xcb, 0x16, 0x04, 0x5b, 0xdf,
+ 0xf8, 0xd2, 0x63, 0x51, 0x02, 0x22, 0xc6, 0x0f,
+ 0x98, 0x2b, 0x12, 0x52, 0x25, 0x64, 0x93, 0xd9,
+ 0xab, 0xe9, 0x4d, 0x16, 0x4b, 0xf6, 0x09, 0x83,
+ 0x5c, 0x63, 0x1c, 0x41, 0x19, 0xf6, 0x76, 0xe3,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[SHA512_DIGEST_SIZE] = {
+ 0x5b, 0x9d, 0xf9, 0xab, 0x8c, 0x8e, 0x52, 0xdb,
+ 0x02, 0xa0, 0x4c, 0x24, 0x2d, 0xc4, 0xa8, 0x4e,
+ 0x9c, 0x93, 0x2f, 0x72, 0xa8, 0x75, 0xfb, 0xb5,
+ 0xdb, 0xef, 0x52, 0xc6, 0xa3, 0xfe, 0xeb, 0x6b,
+ 0x92, 0x79, 0x18, 0x05, 0xf6, 0xd7, 0xaf, 0x7b,
+ 0x36, 0xfc, 0x83, 0x2c, 0x7e, 0x7b, 0x59, 0x8b,
+ 0xf9, 0x81, 0xaa, 0x98, 0x38, 0x11, 0x97, 0x56,
+ 0x34, 0xe5, 0x2a, 0x4b, 0xf2, 0x9e, 0xf3, 0xdb,
+};
+
+static const u8 hmac_testvec_consolidated[SHA512_DIGEST_SIZE] = {
+ 0x40, 0xe7, 0xbc, 0x03, 0xdf, 0x22, 0xd4, 0x76,
+ 0x66, 0x45, 0xf8, 0x1d, 0x25, 0xdf, 0xbe, 0xa2,
+ 0x93, 0x06, 0x8c, 0x1d, 0x14, 0x23, 0x9b, 0x5c,
+ 0xfa, 0xac, 0xdf, 0xbd, 0xa2, 0x24, 0xe5, 0xf7,
+ 0xdc, 0xf7, 0xae, 0x96, 0xc1, 0x34, 0xe5, 0x24,
+ 0x16, 0x24, 0xdc, 0xee, 0x4f, 0x62, 0x1c, 0x67,
+ 0x4e, 0x02, 0x31, 0x4b, 0x9b, 0x65, 0x25, 0xeb,
+ 0x32, 0x2e, 0x24, 0xfb, 0xcd, 0x2b, 0x59, 0xd8,
+};
diff --git a/lib/crypto/tests/sha512_kunit.c b/lib/crypto/tests/sha512_kunit.c
new file mode 100644
index 000000000000..8923e2d7d3d4
--- /dev/null
+++ b/lib/crypto/tests/sha512_kunit.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/sha2.h>
+#include "sha512-testvecs.h"
+
+#define HASH sha512
+#define HASH_CTX sha512_ctx
+#define HASH_SIZE SHA512_DIGEST_SIZE
+#define HASH_INIT sha512_init
+#define HASH_UPDATE sha512_update
+#define HASH_FINAL sha512_final
+#define HMAC_KEY hmac_sha512_key
+#define HMAC_CTX hmac_sha512_ctx
+#define HMAC_PREPAREKEY hmac_sha512_preparekey
+#define HMAC_INIT hmac_sha512_init
+#define HMAC_UPDATE hmac_sha512_update
+#define HMAC_FINAL hmac_sha512_final
+#define HMAC hmac_sha512
+#define HMAC_USINGRAWKEY hmac_sha512_usingrawkey
+#include "hash-test-template.h"
+
+static struct kunit_case hash_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite hash_test_suite = {
+ .name = "sha512",
+ .test_cases = hash_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(hash_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-512 and HMAC-SHA512");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/utils.c b/lib/crypto/utils.c
index 87da2a6dd161..dec381d5e906 100644
--- a/lib/crypto/utils.c
+++ b/lib/crypto/utils.c
@@ -5,9 +5,10 @@
* Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
*/
-#include <linux/unaligned.h>
#include <crypto/utils.h>
+#include <linux/export.h>
#include <linux/module.h>
+#include <linux/unaligned.h>
/*
* XOR @len bytes from @src1 and @src2 together, writing the result to @dst
diff --git a/lib/crypto/x86/.gitignore b/lib/crypto/x86/.gitignore
new file mode 100644
index 000000000000..580c839bb177
--- /dev/null
+++ b/lib/crypto/x86/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+poly1305-x86_64-cryptogams.S
diff --git a/lib/crypto/x86/blake2s-core.S b/lib/crypto/x86/blake2s-core.S
new file mode 100644
index 000000000000..7b1d98ca7482
--- /dev/null
+++ b/lib/crypto/x86/blake2s-core.S
@@ -0,0 +1,291 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.iv, "aM", @progbits, 32
+.align 32
+.Liv:
+ .octa 0xA54FF53A3C6EF372BB67AE856A09E667
+ .octa 0x5BE0CD191F83D9AB9B05688C510E527F
+
+.section .rodata.cst16.ror16, "aM", @progbits, 16
+.align 16
+.Lror16:
+ .octa 0x0D0C0F0E09080B0A0504070601000302
+
+.section .rodata.cst16.ror8, "aM", @progbits, 16
+.align 16
+.Lror8:
+ .octa 0x0C0F0E0D080B0A090407060500030201
+
+.section .rodata.cst64.sigma, "aM", @progbits, 160
+.align 64
+.Lsigma:
+.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
+.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
+.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
+.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
+.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
+.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
+.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
+.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
+.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
+.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
+
+.section .rodata.cst64.sigma2, "aM", @progbits, 160
+.align 64
+.Lsigma2:
+.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
+.byte 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
+.byte 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
+.byte 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
+.byte 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
+.byte 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
+.byte 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
+.byte 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
+.byte 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
+.byte 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
+
+#define CTX %rdi
+#define DATA %rsi
+#define NBLOCKS %rdx
+#define INC %ecx
+
+.text
+//
+// void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
+// const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_ctx are used:
+// u32 h[8]; (inout)
+// u32 t[2]; (inout)
+// u32 f[2]; (in)
+//
+SYM_FUNC_START(blake2s_compress_ssse3)
+ movdqu (CTX),%xmm0 // Load h[0..3]
+ movdqu 16(CTX),%xmm1 // Load h[4..7]
+ movdqa .Lror16(%rip),%xmm12
+ movdqa .Lror8(%rip),%xmm13
+ movdqu 32(CTX),%xmm14 // Load t and f
+ movd INC,%xmm15 // Load inc
+ leaq .Lsigma+160(%rip),%r8
+ jmp .Lssse3_mainloop
+
+ .align 32
+.Lssse3_mainloop:
+ // Main loop: each iteration processes one 64-byte block.
+ movdqa %xmm0,%xmm10 // Save h[0..3] and let v[0..3] = h[0..3]
+ movdqa %xmm1,%xmm11 // Save h[4..7] and let v[4..7] = h[4..7]
+ paddq %xmm15,%xmm14 // t += inc (64-bit addition)
+ movdqa .Liv(%rip),%xmm2 // v[8..11] = iv[0..3]
+ movdqa %xmm14,%xmm3
+ pxor .Liv+16(%rip),%xmm3 // v[12..15] = iv[4..7] ^ [t, f]
+ leaq .Lsigma(%rip),%rcx
+
+.Lssse3_roundloop:
+ // Round loop: each iteration does 1 round (of 10 rounds total).
+ movzbl (%rcx),%eax
+ movd (DATA,%rax,4),%xmm4
+ movzbl 1(%rcx),%eax
+ movd (DATA,%rax,4),%xmm5
+ movzbl 2(%rcx),%eax
+ movd (DATA,%rax,4),%xmm6
+ movzbl 3(%rcx),%eax
+ movd (DATA,%rax,4),%xmm7
+ punpckldq %xmm5,%xmm4
+ punpckldq %xmm7,%xmm6
+ punpcklqdq %xmm6,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $12,%xmm1
+ pslld $20,%xmm8
+ por %xmm8,%xmm1
+ movzbl 4(%rcx),%eax
+ movd (DATA,%rax,4),%xmm5
+ movzbl 5(%rcx),%eax
+ movd (DATA,%rax,4),%xmm6
+ movzbl 6(%rcx),%eax
+ movd (DATA,%rax,4),%xmm7
+ movzbl 7(%rcx),%eax
+ movd (DATA,%rax,4),%xmm4
+ punpckldq %xmm6,%xmm5
+ punpckldq %xmm4,%xmm7
+ punpcklqdq %xmm7,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $7,%xmm1
+ pslld $25,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movzbl 8(%rcx),%eax
+ movd (DATA,%rax,4),%xmm6
+ movzbl 9(%rcx),%eax
+ movd (DATA,%rax,4),%xmm7
+ movzbl 10(%rcx),%eax
+ movd (DATA,%rax,4),%xmm4
+ movzbl 11(%rcx),%eax
+ movd (DATA,%rax,4),%xmm5
+ punpckldq %xmm7,%xmm6
+ punpckldq %xmm5,%xmm4
+ punpcklqdq %xmm4,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $12,%xmm1
+ pslld $20,%xmm8
+ por %xmm8,%xmm1
+ movzbl 12(%rcx),%eax
+ movd (DATA,%rax,4),%xmm7
+ movzbl 13(%rcx),%eax
+ movd (DATA,%rax,4),%xmm4
+ movzbl 14(%rcx),%eax
+ movd (DATA,%rax,4),%xmm5
+ movzbl 15(%rcx),%eax
+ movd (DATA,%rax,4),%xmm6
+ punpckldq %xmm4,%xmm7
+ punpckldq %xmm6,%xmm5
+ punpcklqdq %xmm5,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $7,%xmm1
+ pslld $25,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ addq $16,%rcx
+ cmpq %r8,%rcx
+ jnz .Lssse3_roundloop
+
+ // Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm1
+ pxor %xmm10,%xmm0
+ pxor %xmm11,%xmm1
+ addq $64,DATA
+ decq NBLOCKS
+ jnz .Lssse3_mainloop
+
+ movdqu %xmm0,(CTX) // Store new h[0..3]
+ movdqu %xmm1,16(CTX) // Store new h[4..7]
+ movq %xmm14,32(CTX) // Store new t (f is unchanged)
+ RET
+SYM_FUNC_END(blake2s_compress_ssse3)
+
+//
+// void blake2s_compress_avx512(struct blake2s_ctx *ctx,
+// const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_ctx are used:
+// u32 h[8]; (inout)
+// u32 t[2]; (inout)
+// u32 f[2]; (in)
+//
+SYM_FUNC_START(blake2s_compress_avx512)
+ vmovdqu (CTX),%xmm0 // Load h[0..3]
+ vmovdqu 16(CTX),%xmm1 // Load h[4..7]
+ vmovdqu 32(CTX),%xmm4 // Load t and f
+ vmovd INC,%xmm5 // Load inc
+ vmovdqa .Liv(%rip),%xmm14 // Load iv[0..3]
+ vmovdqa .Liv+16(%rip),%xmm15 // Load iv[4..7]
+ jmp .Lavx512_mainloop
+
+ .align 32
+.Lavx512_mainloop:
+ // Main loop: each iteration processes one 64-byte block.
+ vmovdqa %xmm0,%xmm10 // Save h[0..3] and let v[0..3] = h[0..3]
+ vmovdqa %xmm1,%xmm11 // Save h[4..7] and let v[4..7] = h[4..7]
+ vpaddq %xmm5,%xmm4,%xmm4 // t += inc (64-bit addition)
+ vmovdqa %xmm14,%xmm2 // v[8..11] = iv[0..3]
+ vpxor %xmm15,%xmm4,%xmm3 // v[12..15] = iv[4..7] ^ [t, f]
+ vmovdqu (DATA),%ymm6 // Load first 8 data words
+ vmovdqu 32(DATA),%ymm7 // Load second 8 data words
+ addq $64,DATA
+ leaq .Lsigma2(%rip),%rax
+ movb $10,%cl // Set num rounds remaining
+
+.Lavx512_roundloop:
+ // Round loop: each iteration does 1 round (of 10 rounds total).
+ vpmovzxbd (%rax),%ymm8
+ vpmovzxbd 8(%rax),%ymm9
+ addq $16,%rax
+ vpermi2d %ymm7,%ymm6,%ymm8
+ vpermi2d %ymm7,%ymm6,%ymm9
+ vmovdqa %ymm8,%ymm6
+ vmovdqa %ymm9,%ymm7
+ vpaddd %xmm8,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $16,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $12,%xmm1,%xmm1
+ vextracti128 $1,%ymm8,%xmm8
+ vpaddd %xmm8,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $8,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $7,%xmm1,%xmm1
+ vpshufd $0x93,%xmm0,%xmm0
+ vpshufd $0x4e,%xmm3,%xmm3
+ vpshufd $0x39,%xmm2,%xmm2
+ vpaddd %xmm9,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $16,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $12,%xmm1,%xmm1
+ vextracti128 $1,%ymm9,%xmm9
+ vpaddd %xmm9,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $8,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $7,%xmm1,%xmm1
+ vpshufd $0x39,%xmm0,%xmm0
+ vpshufd $0x4e,%xmm3,%xmm3
+ vpshufd $0x93,%xmm2,%xmm2
+ decb %cl
+ jne .Lavx512_roundloop
+
+ // Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
+ vpternlogd $0x96,%xmm10,%xmm2,%xmm0
+ vpternlogd $0x96,%xmm11,%xmm3,%xmm1
+ decq NBLOCKS
+ jne .Lavx512_mainloop
+
+ vmovdqu %xmm0,(CTX) // Store new h[0..3]
+ vmovdqu %xmm1,16(CTX) // Store new h[4..7]
+ vmovq %xmm4,32(CTX) // Store new t (f is unchanged)
+ vzeroupper
+ RET
+SYM_FUNC_END(blake2s_compress_avx512)
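
Both entry points above hard-code the offsets 0, 16 and 32 into the
context. A C-side sketch of the layout those offsets assume (only the three
leading fields matter to the assembly; trailing fields are elided, and this
is an orientation aid, not the authoritative definition from
<crypto/blake2s.h>):

	struct blake2s_ctx {
		u32 h[8];	/* bytes  0..31: chaining value (inout) */
		u32 t[2];	/* bytes 32..39: 64-bit byte counter (inout) */
		u32 f[2];	/* bytes 40..47: finalization flags (in) */
		/* ... buffering fields, unused by the assembly ... */
	};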
diff --git a/lib/crypto/x86/blake2s.h b/lib/crypto/x86/blake2s.h
new file mode 100644
index 000000000000..f8eed6cb042e
--- /dev/null
+++ b/lib/crypto/x86/blake2s.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <asm/cpufeature.h>
+#include <asm/fpu/api.h>
+#include <asm/processor.h>
+#include <asm/simd.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+
+asmlinkage void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc);
+asmlinkage void blake2s_compress_avx512(struct blake2s_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
+
+static void blake2s_compress(struct blake2s_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc)
+{
+ /* SIMD disables preemption, so relax after processing each page. */
+ BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
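+	/* (The check above guarantees that each kernel_fpu section below
+	 * spans at least 8 compression blocks, keeping the FPU state
+	 * save/restore overhead amortized.) */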
+
+ if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
+ blake2s_compress_generic(ctx, data, nblocks, inc);
+ return;
+ }
+
+ do {
+ const size_t blocks = min_t(size_t, nblocks,
+ SZ_4K / BLAKE2S_BLOCK_SIZE);
+
+ kernel_fpu_begin();
+ if (static_branch_likely(&blake2s_use_avx512))
+ blake2s_compress_avx512(ctx, data, blocks, inc);
+ else
+ blake2s_compress_ssse3(ctx, data, blocks, inc);
+ kernel_fpu_end();
+
+ data += blocks * BLAKE2S_BLOCK_SIZE;
+ nblocks -= blocks;
+ } while (nblocks);
+}
+
+#define blake2s_mod_init_arch blake2s_mod_init_arch
+static void blake2s_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ static_branch_enable(&blake2s_use_ssse3);
+
+ if (boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX512F) &&
+ boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+ XFEATURE_MASK_AVX512, NULL))
+ static_branch_enable(&blake2s_use_avx512);
+}
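
None of this dispatch machinery is visible to callers: the arch
blake2s_compress() above simply slots in underneath the generic library
front end. A minimal usage sketch (blake2s() is the one-shot helper from
<crypto/blake2s.h>; the buffer names are illustrative):

	#include <crypto/blake2s.h>

	static void example_digest(const u8 *msg, size_t msg_len)
	{
		u8 digest[BLAKE2S_HASH_SIZE];

		/* Unkeyed hash; the SSSE3/AVX-512 paths are picked via the
		 * static keys set up in blake2s_mod_init_arch(). */
		blake2s(digest, msg, NULL, sizeof(digest), msg_len, 0);
	}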
diff --git a/lib/crypto/x86/chacha-avx2-x86_64.S b/lib/crypto/x86/chacha-avx2-x86_64.S
new file mode 100644
index 000000000000..f3d8fc018249
--- /dev/null
+++ b/lib/crypto/x86/chacha-avx2-x86_64.S
@@ -0,0 +1,1021 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.ROT8, "aM", @progbits, 32
+.align 32
+ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
+ .octa 0x0e0d0c0f0a09080b0605040702010003
+
+.section .rodata.cst32.ROT16, "aM", @progbits, 32
+.align 32
+ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
+ .octa 0x0d0c0f0e09080b0a0504070601000302
+
+.section .rodata.cst32.CTRINC, "aM", @progbits, 32
+.align 32
+CTRINC: .octa 0x00000003000000020000000100000000
+ .octa 0x00000007000000060000000500000004
+
+.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL: .octa 0x00000000000000000000000000000000
+ .octa 0x00000000000000000000000000000001
+
+.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL: .octa 0x00000000000000000000000000000002
+ .octa 0x00000000000000000000000000000003
+
+.text
+
+SYM_FUNC_START(chacha_2block_xor_avx2)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 2 data blocks output, o
+ # %rdx: up to 2 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts two ChaCha blocks by loading the state
+ # matrix twice across four AVX registers. It performs matrix operations
+ # on four words in each matrix in parallel, but requires shuffling to
+ # rearrange the words after each round.
+
+ vzeroupper
+
+	# x0..3[0-1] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+
+ vmovdqa %ymm0,%ymm8
+ vmovdqa %ymm1,%ymm9
+ vmovdqa %ymm2,%ymm10
+ vmovdqa %ymm3,%ymm11
+
+ vmovdqa ROT8(%rip),%ymm4
+ vmovdqa ROT16(%rip),%ymm5
+
+ mov %rcx,%rax
+
+.Ldoubleround:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm5,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm6
+ vpslld $12,%ymm6,%ymm6
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm6,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm4,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm7
+ vpslld $7,%ymm7,%ymm7
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm5,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm6
+ vpslld $12,%ymm6,%ymm6
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm6,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm4,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm7
+ vpslld $7,%ymm7,%ymm7
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+
+ sub $2,%r8d
+ jnz .Ldoubleround
+
+ # o0 = i0 ^ (x0 + s0)
+ vpaddd %ymm8,%ymm0,%ymm7
+ cmp $0x10,%rax
+ jl .Lxorpart2
+ vpxor 0x00(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x00(%rsi)
+ vextracti128 $1,%ymm7,%xmm0
+ # o1 = i1 ^ (x1 + s1)
+ vpaddd %ymm9,%ymm1,%ymm7
+ cmp $0x20,%rax
+ jl .Lxorpart2
+ vpxor 0x10(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x10(%rsi)
+ vextracti128 $1,%ymm7,%xmm1
+ # o2 = i2 ^ (x2 + s2)
+ vpaddd %ymm10,%ymm2,%ymm7
+ cmp $0x30,%rax
+ jl .Lxorpart2
+ vpxor 0x20(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x20(%rsi)
+ vextracti128 $1,%ymm7,%xmm2
+ # o3 = i3 ^ (x3 + s3)
+ vpaddd %ymm11,%ymm3,%ymm7
+ cmp $0x40,%rax
+ jl .Lxorpart2
+ vpxor 0x30(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x30(%rsi)
+ vextracti128 $1,%ymm7,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm7
+ cmp $0x50,%rax
+ jl .Lxorpart2
+ vpxor 0x40(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm7
+ cmp $0x60,%rax
+ jl .Lxorpart2
+ vpxor 0x50(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm7
+ cmp $0x70,%rax
+ jl .Lxorpart2
+ vpxor 0x60(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm7
+ cmp $0x80,%rax
+ jl .Lxorpart2
+ vpxor 0x70(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x70(%rsi)
+
+.Ldone2:
+ vzeroupper
+ RET
+
+.Lxorpart2:
+ # xor remaining bytes from partial register into output
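+	# (The partial tail is bounced through an aligned stack slot: the
+	# remaining input bytes are copied in with rep movsb, XORed against
+	# the keystream still held in %xmm7, and copied back out to the
+	# destination.)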
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone2
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea 8(%rsp),%r10
+ sub $0x10,%rsp
+ and $~31,%rsp
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ vpxor 0x00(%rsp),%xmm7,%xmm7
+ vmovdqa %xmm7,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ lea -8(%r10),%rsp
+ jmp .Ldone2
+
+SYM_FUNC_END(chacha_2block_xor_avx2)
+
+SYM_FUNC_START(chacha_4block_xor_avx2)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 4 data blocks output, o
+ # %rdx: up to 4 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+	# This function encrypts four ChaCha blocks by loading the state
+	# matrix four times across eight AVX registers. It performs matrix
+	# operations on four words in two matrices in parallel, interleaved
+	# with the same operations on the other two matrices. Because the
+	# required word shuffling has rather high latency, the arithmetic on
+	# the two matrix pairs can be interleaved without much slowdown.
+
+ vzeroupper
+
+	# x0..3[0-3] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vmovdqa %ymm0,%ymm4
+ vmovdqa %ymm1,%ymm5
+ vmovdqa %ymm2,%ymm6
+ vmovdqa %ymm3,%ymm7
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+ vpaddd CTR4BL(%rip),%ymm7,%ymm7
+
+ vmovdqa %ymm0,%ymm11
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm3,%ymm14
+ vmovdqa %ymm7,%ymm15
+
+ vmovdqa ROT8(%rip),%ymm8
+ vmovdqa ROT16(%rip),%ymm9
+
+ mov %rcx,%rax
+
+.Ldoubleround4:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm9,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ vpshufd $0x39,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+ vpshufd $0x93,%ymm7,%ymm7
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm9,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ vpshufd $0x93,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+ vpshufd $0x39,%ymm7,%ymm7
+
+ sub $2,%r8d
+ jnz .Ldoubleround4
+
+ # o0 = i0 ^ (x0 + s0), first block
+ vpaddd %ymm11,%ymm0,%ymm10
+ cmp $0x10,%rax
+ jl .Lxorpart4
+ vpxor 0x00(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x00(%rsi)
+ vextracti128 $1,%ymm10,%xmm0
+ # o1 = i1 ^ (x1 + s1), first block
+ vpaddd %ymm12,%ymm1,%ymm10
+ cmp $0x20,%rax
+ jl .Lxorpart4
+ vpxor 0x10(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x10(%rsi)
+ vextracti128 $1,%ymm10,%xmm1
+ # o2 = i2 ^ (x2 + s2), first block
+ vpaddd %ymm13,%ymm2,%ymm10
+ cmp $0x30,%rax
+ jl .Lxorpart4
+ vpxor 0x20(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x20(%rsi)
+ vextracti128 $1,%ymm10,%xmm2
+ # o3 = i3 ^ (x3 + s3), first block
+ vpaddd %ymm14,%ymm3,%ymm10
+ cmp $0x40,%rax
+ jl .Lxorpart4
+ vpxor 0x30(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x30(%rsi)
+ vextracti128 $1,%ymm10,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm10
+ cmp $0x50,%rax
+ jl .Lxorpart4
+ vpxor 0x40(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm10
+ cmp $0x60,%rax
+ jl .Lxorpart4
+ vpxor 0x50(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm10
+ cmp $0x70,%rax
+ jl .Lxorpart4
+ vpxor 0x60(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm10
+ cmp $0x80,%rax
+ jl .Lxorpart4
+ vpxor 0x70(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x70(%rsi)
+
+ # o0 = i0 ^ (x0 + s0), third block
+ vpaddd %ymm11,%ymm4,%ymm10
+ cmp $0x90,%rax
+ jl .Lxorpart4
+ vpxor 0x80(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x80(%rsi)
+ vextracti128 $1,%ymm10,%xmm4
+ # o1 = i1 ^ (x1 + s1), third block
+ vpaddd %ymm12,%ymm5,%ymm10
+ cmp $0xa0,%rax
+ jl .Lxorpart4
+ vpxor 0x90(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x90(%rsi)
+ vextracti128 $1,%ymm10,%xmm5
+ # o2 = i2 ^ (x2 + s2), third block
+ vpaddd %ymm13,%ymm6,%ymm10
+ cmp $0xb0,%rax
+ jl .Lxorpart4
+ vpxor 0xa0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xa0(%rsi)
+ vextracti128 $1,%ymm10,%xmm6
+ # o3 = i3 ^ (x3 + s3), third block
+ vpaddd %ymm15,%ymm7,%ymm10
+ cmp $0xc0,%rax
+ jl .Lxorpart4
+ vpxor 0xb0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xb0(%rsi)
+ vextracti128 $1,%ymm10,%xmm7
+
+ # xor and write fourth block
+ vmovdqa %xmm4,%xmm10
+ cmp $0xd0,%rax
+ jl .Lxorpart4
+ vpxor 0xc0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xc0(%rsi)
+
+ vmovdqa %xmm5,%xmm10
+ cmp $0xe0,%rax
+ jl .Lxorpart4
+ vpxor 0xd0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xd0(%rsi)
+
+ vmovdqa %xmm6,%xmm10
+ cmp $0xf0,%rax
+ jl .Lxorpart4
+ vpxor 0xe0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xe0(%rsi)
+
+ vmovdqa %xmm7,%xmm10
+ cmp $0x100,%rax
+ jl .Lxorpart4
+ vpxor 0xf0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xf0(%rsi)
+
+.Ldone4:
+ vzeroupper
+ RET
+
+.Lxorpart4:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone4
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea 8(%rsp),%r10
+ sub $0x10,%rsp
+ and $~31,%rsp
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ vpxor 0x00(%rsp),%xmm10,%xmm10
+ vmovdqa %xmm10,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ lea -8(%r10),%rsp
+ jmp .Ldone4
+
+SYM_FUNC_END(chacha_4block_xor_avx2)
+
+SYM_FUNC_START(chacha_8block_xor_avx2)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 8 data blocks output, o
+ # %rdx: up to 8 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+	# This function encrypts eight consecutive ChaCha blocks by loading
+	# the state matrix into AVX registers eight times. As some scratch
+	# registers are needed, the first four state rows are kept on the
+	# stack. The algorithm performs each operation on the corresponding
+	# word of each state matrix, hence requires no word shuffling. For
+	# the final XOR step, the matrix is transposed by interleaving 32-,
+	# 64- and then 128-bit words, which allows the XOR to be done in AVX
+	# registers. 8/16-bit word rotation is done with the slightly better
+	# performing byte shuffling; 7/12-bit word rotation uses the
+	# traditional shift+OR.
+
+ vzeroupper
+ # 4 * 32 byte stack, 32-byte aligned
+ lea 8(%rsp),%r10
+ and $~31, %rsp
+ sub $0x80, %rsp
+ mov %rcx,%rax
+
+ # x0..15[0-7] = s[0..15]
+ vpbroadcastd 0x00(%rdi),%ymm0
+ vpbroadcastd 0x04(%rdi),%ymm1
+ vpbroadcastd 0x08(%rdi),%ymm2
+ vpbroadcastd 0x0c(%rdi),%ymm3
+ vpbroadcastd 0x10(%rdi),%ymm4
+ vpbroadcastd 0x14(%rdi),%ymm5
+ vpbroadcastd 0x18(%rdi),%ymm6
+ vpbroadcastd 0x1c(%rdi),%ymm7
+ vpbroadcastd 0x20(%rdi),%ymm8
+ vpbroadcastd 0x24(%rdi),%ymm9
+ vpbroadcastd 0x28(%rdi),%ymm10
+ vpbroadcastd 0x2c(%rdi),%ymm11
+ vpbroadcastd 0x30(%rdi),%ymm12
+ vpbroadcastd 0x34(%rdi),%ymm13
+ vpbroadcastd 0x38(%rdi),%ymm14
+ vpbroadcastd 0x3c(%rdi),%ymm15
+ # x0..3 on stack
+ vmovdqa %ymm0,0x00(%rsp)
+ vmovdqa %ymm1,0x20(%rsp)
+ vmovdqa %ymm2,0x40(%rsp)
+ vmovdqa %ymm3,0x60(%rsp)
+
+ vmovdqa CTRINC(%rip),%ymm1
+ vmovdqa ROT8(%rip),%ymm2
+ vmovdqa ROT16(%rip),%ymm3
+
+	# x12 += counter values 0-7
+ vpaddd %ymm1,%ymm12,%ymm12
+
+.Ldoubleround8:
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ vpaddd 0x00(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm3,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ vpaddd 0x20(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm3,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ vpaddd 0x40(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm3,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ vpaddd 0x60(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm3,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $12,%ymm4,%ymm0
+ vpsrld $20,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $12,%ymm5,%ymm0
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $12,%ymm6,%ymm0
+ vpsrld $20,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm11,%ymm7,%ymm7
+ vpslld $12,%ymm7,%ymm0
+ vpsrld $20,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ vpaddd 0x00(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm2,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ vpaddd 0x20(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm2,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ vpaddd 0x40(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm2,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ vpaddd 0x60(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm2,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm0
+ vpsrld $25,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm0
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm0
+ vpsrld $25,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm11,%ymm7,%ymm7
+ vpslld $7,%ymm7,%ymm0
+ vpsrld $25,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ vpaddd 0x00(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm3,%ymm15,%ymm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ vpaddd 0x20(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm3,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ vpaddd 0x40(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm3,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ vpaddd 0x60(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm3,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxor %ymm10,%ymm5,%ymm5
+ vpslld $12,%ymm5,%ymm0
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxor %ymm11,%ymm6,%ymm6
+ vpslld $12,%ymm6,%ymm0
+ vpsrld $20,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxor %ymm8,%ymm7,%ymm7
+ vpslld $12,%ymm7,%ymm0
+ vpsrld $20,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm9,%ymm4,%ymm4
+ vpslld $12,%ymm4,%ymm0
+ vpsrld $20,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ vpaddd 0x00(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm2,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ vpaddd 0x20(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm2,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ vpaddd 0x40(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm2,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ vpaddd 0x60(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm2,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxor %ymm10,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm0
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxor %ymm11,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm0
+ vpsrld $25,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxor %ymm8,%ymm7,%ymm7
+ vpslld $7,%ymm7,%ymm0
+ vpsrld $25,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm9,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm0
+ vpsrld $25,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+
+ sub $2,%r8d
+ jnz .Ldoubleround8
+
+	# x0..15[0-7] += s[0..15]
+ vpbroadcastd 0x00(%rdi),%ymm0
+ vpaddd 0x00(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpbroadcastd 0x04(%rdi),%ymm0
+ vpaddd 0x20(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpbroadcastd 0x08(%rdi),%ymm0
+ vpaddd 0x40(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpbroadcastd 0x0c(%rdi),%ymm0
+ vpaddd 0x60(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpbroadcastd 0x10(%rdi),%ymm0
+ vpaddd %ymm0,%ymm4,%ymm4
+ vpbroadcastd 0x14(%rdi),%ymm0
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpbroadcastd 0x18(%rdi),%ymm0
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpbroadcastd 0x1c(%rdi),%ymm0
+ vpaddd %ymm0,%ymm7,%ymm7
+ vpbroadcastd 0x20(%rdi),%ymm0
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpbroadcastd 0x24(%rdi),%ymm0
+ vpaddd %ymm0,%ymm9,%ymm9
+ vpbroadcastd 0x28(%rdi),%ymm0
+ vpaddd %ymm0,%ymm10,%ymm10
+ vpbroadcastd 0x2c(%rdi),%ymm0
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpbroadcastd 0x30(%rdi),%ymm0
+ vpaddd %ymm0,%ymm12,%ymm12
+ vpbroadcastd 0x34(%rdi),%ymm0
+ vpaddd %ymm0,%ymm13,%ymm13
+ vpbroadcastd 0x38(%rdi),%ymm0
+ vpaddd %ymm0,%ymm14,%ymm14
+ vpbroadcastd 0x3c(%rdi),%ymm0
+ vpaddd %ymm0,%ymm15,%ymm15
+
+	# x12 += counter values 0-7
+ vpaddd %ymm1,%ymm12,%ymm12
+
+ # interleave 32-bit words in state n, n+1
+ vmovdqa 0x00(%rsp),%ymm0
+ vmovdqa 0x20(%rsp),%ymm1
+ vpunpckldq %ymm1,%ymm0,%ymm2
+ vpunpckhdq %ymm1,%ymm0,%ymm1
+ vmovdqa %ymm2,0x00(%rsp)
+ vmovdqa %ymm1,0x20(%rsp)
+ vmovdqa 0x40(%rsp),%ymm0
+ vmovdqa 0x60(%rsp),%ymm1
+ vpunpckldq %ymm1,%ymm0,%ymm2
+ vpunpckhdq %ymm1,%ymm0,%ymm1
+ vmovdqa %ymm2,0x40(%rsp)
+ vmovdqa %ymm1,0x60(%rsp)
+ vmovdqa %ymm4,%ymm0
+ vpunpckldq %ymm5,%ymm0,%ymm4
+ vpunpckhdq %ymm5,%ymm0,%ymm5
+ vmovdqa %ymm6,%ymm0
+ vpunpckldq %ymm7,%ymm0,%ymm6
+ vpunpckhdq %ymm7,%ymm0,%ymm7
+ vmovdqa %ymm8,%ymm0
+ vpunpckldq %ymm9,%ymm0,%ymm8
+ vpunpckhdq %ymm9,%ymm0,%ymm9
+ vmovdqa %ymm10,%ymm0
+ vpunpckldq %ymm11,%ymm0,%ymm10
+ vpunpckhdq %ymm11,%ymm0,%ymm11
+ vmovdqa %ymm12,%ymm0
+ vpunpckldq %ymm13,%ymm0,%ymm12
+ vpunpckhdq %ymm13,%ymm0,%ymm13
+ vmovdqa %ymm14,%ymm0
+ vpunpckldq %ymm15,%ymm0,%ymm14
+ vpunpckhdq %ymm15,%ymm0,%ymm15
+
+ # interleave 64-bit words in state n, n+2
+ vmovdqa 0x00(%rsp),%ymm0
+ vmovdqa 0x40(%rsp),%ymm2
+ vpunpcklqdq %ymm2,%ymm0,%ymm1
+ vpunpckhqdq %ymm2,%ymm0,%ymm2
+ vmovdqa %ymm1,0x00(%rsp)
+ vmovdqa %ymm2,0x40(%rsp)
+ vmovdqa 0x20(%rsp),%ymm0
+ vmovdqa 0x60(%rsp),%ymm2
+ vpunpcklqdq %ymm2,%ymm0,%ymm1
+ vpunpckhqdq %ymm2,%ymm0,%ymm2
+ vmovdqa %ymm1,0x20(%rsp)
+ vmovdqa %ymm2,0x60(%rsp)
+ vmovdqa %ymm4,%ymm0
+ vpunpcklqdq %ymm6,%ymm0,%ymm4
+ vpunpckhqdq %ymm6,%ymm0,%ymm6
+ vmovdqa %ymm5,%ymm0
+ vpunpcklqdq %ymm7,%ymm0,%ymm5
+ vpunpckhqdq %ymm7,%ymm0,%ymm7
+ vmovdqa %ymm8,%ymm0
+ vpunpcklqdq %ymm10,%ymm0,%ymm8
+ vpunpckhqdq %ymm10,%ymm0,%ymm10
+ vmovdqa %ymm9,%ymm0
+ vpunpcklqdq %ymm11,%ymm0,%ymm9
+ vpunpckhqdq %ymm11,%ymm0,%ymm11
+ vmovdqa %ymm12,%ymm0
+ vpunpcklqdq %ymm14,%ymm0,%ymm12
+ vpunpckhqdq %ymm14,%ymm0,%ymm14
+ vmovdqa %ymm13,%ymm0
+ vpunpcklqdq %ymm15,%ymm0,%ymm13
+ vpunpckhqdq %ymm15,%ymm0,%ymm15
+
+ # interleave 128-bit words in state n, n+4
+ # xor/write first four blocks
+ vmovdqa 0x00(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm4,%ymm1,%ymm0
+ cmp $0x0020,%rax
+ jl .Lxorpart8
+ vpxor 0x0000(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0000(%rsi)
+ vperm2i128 $0x31,%ymm4,%ymm1,%ymm4
+
+ vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
+ cmp $0x0040,%rax
+ jl .Lxorpart8
+ vpxor 0x0020(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0020(%rsi)
+ vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
+
+ vmovdqa 0x40(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm6,%ymm1,%ymm0
+ cmp $0x0060,%rax
+ jl .Lxorpart8
+ vpxor 0x0040(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0040(%rsi)
+ vperm2i128 $0x31,%ymm6,%ymm1,%ymm6
+
+ vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
+ cmp $0x0080,%rax
+ jl .Lxorpart8
+ vpxor 0x0060(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0060(%rsi)
+ vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
+
+ vmovdqa 0x20(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
+ cmp $0x00a0,%rax
+ jl .Lxorpart8
+ vpxor 0x0080(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0080(%rsi)
+ vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
+
+ vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
+ cmp $0x00c0,%rax
+ jl .Lxorpart8
+ vpxor 0x00a0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x00a0(%rsi)
+ vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
+
+ vmovdqa 0x60(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm7,%ymm1,%ymm0
+ cmp $0x00e0,%rax
+ jl .Lxorpart8
+ vpxor 0x00c0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x00c0(%rsi)
+ vperm2i128 $0x31,%ymm7,%ymm1,%ymm7
+
+ vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
+ cmp $0x0100,%rax
+ jl .Lxorpart8
+ vpxor 0x00e0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x00e0(%rsi)
+ vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
+
+ # xor remaining blocks, write to output
+ vmovdqa %ymm4,%ymm0
+ cmp $0x0120,%rax
+ jl .Lxorpart8
+ vpxor 0x0100(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0100(%rsi)
+
+ vmovdqa %ymm12,%ymm0
+ cmp $0x0140,%rax
+ jl .Lxorpart8
+ vpxor 0x0120(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0120(%rsi)
+
+ vmovdqa %ymm6,%ymm0
+ cmp $0x0160,%rax
+ jl .Lxorpart8
+ vpxor 0x0140(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0140(%rsi)
+
+ vmovdqa %ymm14,%ymm0
+ cmp $0x0180,%rax
+ jl .Lxorpart8
+ vpxor 0x0160(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0160(%rsi)
+
+ vmovdqa %ymm5,%ymm0
+ cmp $0x01a0,%rax
+ jl .Lxorpart8
+ vpxor 0x0180(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0180(%rsi)
+
+ vmovdqa %ymm13,%ymm0
+ cmp $0x01c0,%rax
+ jl .Lxorpart8
+ vpxor 0x01a0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x01a0(%rsi)
+
+ vmovdqa %ymm7,%ymm0
+ cmp $0x01e0,%rax
+ jl .Lxorpart8
+ vpxor 0x01c0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x01c0(%rsi)
+
+ vmovdqa %ymm15,%ymm0
+ cmp $0x0200,%rax
+ jl .Lxorpart8
+ vpxor 0x01e0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x01e0(%rsi)
+
+.Ldone8:
+ vzeroupper
+ lea -8(%r10),%rsp
+ RET
+
+.Lxorpart8:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x1f,%r9
+ jz .Ldone8
+ and $~0x1f,%rax
+
+ mov %rsi,%r11
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ vpxor 0x00(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ jmp .Ldone8
+
+SYM_FUNC_END(chacha_8block_xor_avx2)
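
All three entry points above vectorize the same scalar kernel that the
'# x0 += x1, x3 = rotl32(x3 ^ x0, 16)' comments annotate. For reference, a
minimal C sketch of the ChaCha quarter-round and double round (rol32() is
the rotate helper from <linux/bitops.h>):

	#include <linux/bitops.h>

	static void chacha_qr(u32 *a, u32 *b, u32 *c, u32 *d)
	{
		*a += *b; *d = rol32(*d ^ *a, 16);
		*c += *d; *b = rol32(*b ^ *c, 12);
		*a += *b; *d = rol32(*d ^ *a, 8);
		*c += *d; *b = rol32(*b ^ *c, 7);
	}

	static void chacha_doubleround(u32 x[16])
	{
		/* column round */
		chacha_qr(&x[0], &x[4], &x[8],  &x[12]);
		chacha_qr(&x[1], &x[5], &x[9],  &x[13]);
		chacha_qr(&x[2], &x[6], &x[10], &x[14]);
		chacha_qr(&x[3], &x[7], &x[11], &x[15]);
		/* diagonal round */
		chacha_qr(&x[0], &x[5], &x[10], &x[15]);
		chacha_qr(&x[1], &x[6], &x[11], &x[12]);
		chacha_qr(&x[2], &x[7], &x[8],  &x[13]);
		chacha_qr(&x[3], &x[4], &x[9],  &x[14]);
	}

The vpshufd shuffles in the 2- and 4-block functions are what turn the four
diagonal quarter-rounds back into column form, so all four run in a single
vector instruction.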
diff --git a/lib/crypto/x86/chacha-avx512vl-x86_64.S b/lib/crypto/x86/chacha-avx512vl-x86_64.S
new file mode 100644
index 000000000000..259383e1ad44
--- /dev/null
+++ b/lib/crypto/x86/chacha-avx512vl-x86_64.S
@@ -0,0 +1,836 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
+ *
+ * Copyright (C) 2018 Martin Willi
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL: .octa 0x00000000000000000000000000000000
+ .octa 0x00000000000000000000000000000001
+
+.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL: .octa 0x00000000000000000000000000000002
+ .octa 0x00000000000000000000000000000003
+
+.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
+.align 32
+CTR8BL: .octa 0x00000003000000020000000100000000
+ .octa 0x00000007000000060000000500000004
+
+.text
+
+SYM_FUNC_START(chacha_2block_xor_avx512vl)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 2 data blocks output, o
+ # %rdx: up to 2 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts two ChaCha blocks by loading the state
+ # matrix twice across four AVX registers. It performs matrix operations
+ # on four words in each matrix in parallel, but requires shuffling to
+ # rearrange the words after each round.
+
+ vzeroupper
+
+	# x0..3[0-1] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+
+ vmovdqa %ymm0,%ymm8
+ vmovdqa %ymm1,%ymm9
+ vmovdqa %ymm2,%ymm10
+ vmovdqa %ymm3,%ymm11
+
+.Ldoubleround:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+
+ sub $2,%r8d
+ jnz .Ldoubleround
+
+ # o0 = i0 ^ (x0 + s0)
+ vpaddd %ymm8,%ymm0,%ymm7
+ cmp $0x10,%rcx
+ jl .Lxorpart2
+ vpxord 0x00(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x00(%rsi)
+ vextracti128 $1,%ymm7,%xmm0
+ # o1 = i1 ^ (x1 + s1)
+ vpaddd %ymm9,%ymm1,%ymm7
+ cmp $0x20,%rcx
+ jl .Lxorpart2
+ vpxord 0x10(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x10(%rsi)
+ vextracti128 $1,%ymm7,%xmm1
+ # o2 = i2 ^ (x2 + s2)
+ vpaddd %ymm10,%ymm2,%ymm7
+ cmp $0x30,%rcx
+ jl .Lxorpart2
+ vpxord 0x20(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x20(%rsi)
+ vextracti128 $1,%ymm7,%xmm2
+ # o3 = i3 ^ (x3 + s3)
+ vpaddd %ymm11,%ymm3,%ymm7
+ cmp $0x40,%rcx
+ jl .Lxorpart2
+ vpxord 0x30(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x30(%rsi)
+ vextracti128 $1,%ymm7,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm7
+ cmp $0x50,%rcx
+ jl .Lxorpart2
+ vpxord 0x40(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm7
+ cmp $0x60,%rcx
+ jl .Lxorpart2
+ vpxord 0x50(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm7
+ cmp $0x70,%rcx
+ jl .Lxorpart2
+ vpxord 0x60(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm7
+ cmp $0x80,%rcx
+ jl .Lxorpart2
+ vpxord 0x70(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x70(%rsi)
+
+.Ldone2:
+ vzeroupper
+ RET
+
+.Lxorpart2:
+ # xor remaining bytes from partial register into output
+ mov %rcx,%rax
+ and $0xf,%rcx
+ jz .Ldone2
+ mov %rax,%r9
+ and $~0xf,%r9
+
+ mov $1,%rax
+ shld %cl,%rax,%rax
+ sub $1,%rax
+ kmovq %rax,%k1
+
+ vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
+ vpxord %xmm7,%xmm1,%xmm1
+ vmovdqu8 %xmm1,(%rsi,%r9){%k1}
+
+ jmp .Ldone2
+
+SYM_FUNC_END(chacha_2block_xor_avx512vl)
+
+SYM_FUNC_START(chacha_4block_xor_avx512vl)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 4 data blocks output, o
+ # %rdx: up to 4 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+	# This function encrypts four ChaCha blocks by loading the state
+	# matrix four times across eight AVX registers. It performs matrix
+	# operations on four words in two matrices in parallel, interleaved
+	# with the same operations on the other two matrices. Because the
+	# required word shuffling has rather high latency, the arithmetic on
+	# the two matrix pairs can be interleaved without much slowdown.
+
+ vzeroupper
+
+	# x0..3[0-3] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vmovdqa %ymm0,%ymm4
+ vmovdqa %ymm1,%ymm5
+ vmovdqa %ymm2,%ymm6
+ vmovdqa %ymm3,%ymm7
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+ vpaddd CTR4BL(%rip),%ymm7,%ymm7
+
+ vmovdqa %ymm0,%ymm11
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm3,%ymm14
+ vmovdqa %ymm7,%ymm15
+
+.Ldoubleround4:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $16,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ vpshufd $0x39,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+ vpshufd $0x93,%ymm7,%ymm7
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $16,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ vpshufd $0x93,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+ vpshufd $0x39,%ymm7,%ymm7
+
+ sub $2,%r8d
+ jnz .Ldoubleround4
+
+ # o0 = i0 ^ (x0 + s0), first block
+ vpaddd %ymm11,%ymm0,%ymm10
+ cmp $0x10,%rcx
+ jl .Lxorpart4
+ vpxord 0x00(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x00(%rsi)
+ vextracti128 $1,%ymm10,%xmm0
+ # o1 = i1 ^ (x1 + s1), first block
+ vpaddd %ymm12,%ymm1,%ymm10
+ cmp $0x20,%rcx
+ jl .Lxorpart4
+ vpxord 0x10(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x10(%rsi)
+ vextracti128 $1,%ymm10,%xmm1
+ # o2 = i2 ^ (x2 + s2), first block
+ vpaddd %ymm13,%ymm2,%ymm10
+ cmp $0x30,%rcx
+ jl .Lxorpart4
+ vpxord 0x20(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x20(%rsi)
+ vextracti128 $1,%ymm10,%xmm2
+ # o3 = i3 ^ (x3 + s3), first block
+ vpaddd %ymm14,%ymm3,%ymm10
+ cmp $0x40,%rcx
+ jl .Lxorpart4
+ vpxord 0x30(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x30(%rsi)
+ vextracti128 $1,%ymm10,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm10
+ cmp $0x50,%rcx
+ jl .Lxorpart4
+ vpxord 0x40(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm10
+ cmp $0x60,%rcx
+ jl .Lxorpart4
+ vpxord 0x50(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm10
+ cmp $0x70,%rcx
+ jl .Lxorpart4
+ vpxord 0x60(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm10
+ cmp $0x80,%rcx
+ jl .Lxorpart4
+ vpxord 0x70(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x70(%rsi)
+
+ # o0 = i0 ^ (x0 + s0), third block
+ vpaddd %ymm11,%ymm4,%ymm10
+ cmp $0x90,%rcx
+ jl .Lxorpart4
+ vpxord 0x80(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x80(%rsi)
+ vextracti128 $1,%ymm10,%xmm4
+ # o1 = i1 ^ (x1 + s1), third block
+ vpaddd %ymm12,%ymm5,%ymm10
+ cmp $0xa0,%rcx
+ jl .Lxorpart4
+ vpxord 0x90(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x90(%rsi)
+ vextracti128 $1,%ymm10,%xmm5
+ # o2 = i2 ^ (x2 + s2), third block
+ vpaddd %ymm13,%ymm6,%ymm10
+ cmp $0xb0,%rcx
+ jl .Lxorpart4
+ vpxord 0xa0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xa0(%rsi)
+ vextracti128 $1,%ymm10,%xmm6
+ # o3 = i3 ^ (x3 + s3), third block
+ vpaddd %ymm15,%ymm7,%ymm10
+ cmp $0xc0,%rcx
+ jl .Lxorpart4
+ vpxord 0xb0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xb0(%rsi)
+ vextracti128 $1,%ymm10,%xmm7
+
+ # xor and write fourth block
+ vmovdqa %xmm4,%xmm10
+ cmp $0xd0,%rcx
+ jl .Lxorpart4
+ vpxord 0xc0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xc0(%rsi)
+
+ vmovdqa %xmm5,%xmm10
+ cmp $0xe0,%rcx
+ jl .Lxorpart4
+ vpxord 0xd0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xd0(%rsi)
+
+ vmovdqa %xmm6,%xmm10
+ cmp $0xf0,%rcx
+ jl .Lxorpart4
+ vpxord 0xe0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xe0(%rsi)
+
+ vmovdqa %xmm7,%xmm10
+ cmp $0x100,%rcx
+ jl .Lxorpart4
+ vpxord 0xf0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xf0(%rsi)
+
+.Ldone4:
+ vzeroupper
+ RET
+
+.Lxorpart4:
+ # xor remaining bytes from partial register into output
+ mov %rcx,%rax
+ and $0xf,%rcx
+ jz .Ldone4
+ mov %rax,%r9
+ and $~0xf,%r9
+
+ mov $1,%rax
+ shld %cl,%rax,%rax
+ sub $1,%rax
+ kmovq %rax,%k1
+
+ vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
+ vpxord %xmm10,%xmm1,%xmm1
+ vmovdqu8 %xmm1,(%rsi,%r9){%k1}
+
+ jmp .Ldone4
+
+SYM_FUNC_END(chacha_4block_xor_avx512vl)
+
+SYM_FUNC_START(chacha_8block_xor_avx512vl)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 8 data blocks output, o
+ # %rdx: up to 8 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+	# This function encrypts eight consecutive ChaCha blocks by loading
+	# the state matrix into AVX registers eight times. Compared to AVX2,
+	# this mostly benefits from the new rotate instructions in VL and the
+	# additional registers.
+
+ vzeroupper
+
+ # x0..15[0-7] = s[0..15]
+ vpbroadcastd 0x00(%rdi),%ymm0
+ vpbroadcastd 0x04(%rdi),%ymm1
+ vpbroadcastd 0x08(%rdi),%ymm2
+ vpbroadcastd 0x0c(%rdi),%ymm3
+ vpbroadcastd 0x10(%rdi),%ymm4
+ vpbroadcastd 0x14(%rdi),%ymm5
+ vpbroadcastd 0x18(%rdi),%ymm6
+ vpbroadcastd 0x1c(%rdi),%ymm7
+ vpbroadcastd 0x20(%rdi),%ymm8
+ vpbroadcastd 0x24(%rdi),%ymm9
+ vpbroadcastd 0x28(%rdi),%ymm10
+ vpbroadcastd 0x2c(%rdi),%ymm11
+ vpbroadcastd 0x30(%rdi),%ymm12
+ vpbroadcastd 0x34(%rdi),%ymm13
+ vpbroadcastd 0x38(%rdi),%ymm14
+ vpbroadcastd 0x3c(%rdi),%ymm15
+
+	# x12 += counter values 0-7
+ vpaddd CTR8BL(%rip),%ymm12,%ymm12
+
+ vmovdqa64 %ymm0,%ymm16
+ vmovdqa64 %ymm1,%ymm17
+ vmovdqa64 %ymm2,%ymm18
+ vmovdqa64 %ymm3,%ymm19
+ vmovdqa64 %ymm4,%ymm20
+ vmovdqa64 %ymm5,%ymm21
+ vmovdqa64 %ymm6,%ymm22
+ vmovdqa64 %ymm7,%ymm23
+ vmovdqa64 %ymm8,%ymm24
+ vmovdqa64 %ymm9,%ymm25
+ vmovdqa64 %ymm10,%ymm26
+ vmovdqa64 %ymm11,%ymm27
+ vmovdqa64 %ymm12,%ymm28
+ vmovdqa64 %ymm13,%ymm29
+ vmovdqa64 %ymm14,%ymm30
+ vmovdqa64 %ymm15,%ymm31
+
+.Ldoubleround8:
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ vpaddd %ymm0,%ymm4,%ymm0
+ vpxord %ymm0,%ymm12,%ymm12
+ vprold $16,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ vpaddd %ymm1,%ymm5,%ymm1
+ vpxord %ymm1,%ymm13,%ymm13
+ vprold $16,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ vpaddd %ymm2,%ymm6,%ymm2
+ vpxord %ymm2,%ymm14,%ymm14
+ vprold $16,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ vpaddd %ymm3,%ymm7,%ymm3
+ vpxord %ymm3,%ymm15,%ymm15
+ vprold $16,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxord %ymm8,%ymm4,%ymm4
+ vprold $12,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxord %ymm9,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxord %ymm10,%ymm6,%ymm6
+ vprold $12,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxord %ymm11,%ymm7,%ymm7
+ vprold $12,%ymm7,%ymm7
+
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ vpaddd %ymm0,%ymm4,%ymm0
+ vpxord %ymm0,%ymm12,%ymm12
+ vprold $8,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ vpaddd %ymm1,%ymm5,%ymm1
+ vpxord %ymm1,%ymm13,%ymm13
+ vprold $8,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ vpaddd %ymm2,%ymm6,%ymm2
+ vpxord %ymm2,%ymm14,%ymm14
+ vprold $8,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ vpaddd %ymm3,%ymm7,%ymm3
+ vpxord %ymm3,%ymm15,%ymm15
+ vprold $8,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxord %ymm8,%ymm4,%ymm4
+ vprold $7,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxord %ymm9,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxord %ymm10,%ymm6,%ymm6
+ vprold $7,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxord %ymm11,%ymm7,%ymm7
+ vprold $7,%ymm7,%ymm7
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ vpaddd %ymm0,%ymm5,%ymm0
+ vpxord %ymm0,%ymm15,%ymm15
+ vprold $16,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ vpaddd %ymm1,%ymm6,%ymm1
+ vpxord %ymm1,%ymm12,%ymm12
+ vprold $16,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ vpaddd %ymm2,%ymm7,%ymm2
+ vpxord %ymm2,%ymm13,%ymm13
+ vprold $16,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ vpaddd %ymm3,%ymm4,%ymm3
+ vpxord %ymm3,%ymm14,%ymm14
+ vprold $16,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxord %ymm10,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxord %ymm11,%ymm6,%ymm6
+ vprold $12,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxord %ymm8,%ymm7,%ymm7
+ vprold $12,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxord %ymm9,%ymm4,%ymm4
+ vprold $12,%ymm4,%ymm4
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ vpaddd %ymm0,%ymm5,%ymm0
+ vpxord %ymm0,%ymm15,%ymm15
+ vprold $8,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ vpaddd %ymm1,%ymm6,%ymm1
+ vpxord %ymm1,%ymm12,%ymm12
+ vprold $8,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ vpaddd %ymm2,%ymm7,%ymm2
+ vpxord %ymm2,%ymm13,%ymm13
+ vprold $8,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ vpaddd %ymm3,%ymm4,%ymm3
+ vpxord %ymm3,%ymm14,%ymm14
+ vprold $8,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxord %ymm10,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxord %ymm11,%ymm6,%ymm6
+ vprold $7,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxord %ymm8,%ymm7,%ymm7
+ vprold $7,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxord %ymm9,%ymm4,%ymm4
+ vprold $7,%ymm4,%ymm4
+
+ sub $2,%r8d
+ jnz .Ldoubleround8
+
+	# x0..15[0-7] += s[0..15]
+ vpaddd %ymm16,%ymm0,%ymm0
+ vpaddd %ymm17,%ymm1,%ymm1
+ vpaddd %ymm18,%ymm2,%ymm2
+ vpaddd %ymm19,%ymm3,%ymm3
+ vpaddd %ymm20,%ymm4,%ymm4
+ vpaddd %ymm21,%ymm5,%ymm5
+ vpaddd %ymm22,%ymm6,%ymm6
+ vpaddd %ymm23,%ymm7,%ymm7
+ vpaddd %ymm24,%ymm8,%ymm8
+ vpaddd %ymm25,%ymm9,%ymm9
+ vpaddd %ymm26,%ymm10,%ymm10
+ vpaddd %ymm27,%ymm11,%ymm11
+ vpaddd %ymm28,%ymm12,%ymm12
+ vpaddd %ymm29,%ymm13,%ymm13
+ vpaddd %ymm30,%ymm14,%ymm14
+ vpaddd %ymm31,%ymm15,%ymm15
+
+ # interleave 32-bit words in state n, n+1
+ vpunpckldq %ymm1,%ymm0,%ymm16
+ vpunpckhdq %ymm1,%ymm0,%ymm17
+ vpunpckldq %ymm3,%ymm2,%ymm18
+ vpunpckhdq %ymm3,%ymm2,%ymm19
+ vpunpckldq %ymm5,%ymm4,%ymm20
+ vpunpckhdq %ymm5,%ymm4,%ymm21
+ vpunpckldq %ymm7,%ymm6,%ymm22
+ vpunpckhdq %ymm7,%ymm6,%ymm23
+ vpunpckldq %ymm9,%ymm8,%ymm24
+ vpunpckhdq %ymm9,%ymm8,%ymm25
+ vpunpckldq %ymm11,%ymm10,%ymm26
+ vpunpckhdq %ymm11,%ymm10,%ymm27
+ vpunpckldq %ymm13,%ymm12,%ymm28
+ vpunpckhdq %ymm13,%ymm12,%ymm29
+ vpunpckldq %ymm15,%ymm14,%ymm30
+ vpunpckhdq %ymm15,%ymm14,%ymm31
+
+ # interleave 64-bit words in state n, n+2
+ vpunpcklqdq %ymm18,%ymm16,%ymm0
+ vpunpcklqdq %ymm19,%ymm17,%ymm1
+ vpunpckhqdq %ymm18,%ymm16,%ymm2
+ vpunpckhqdq %ymm19,%ymm17,%ymm3
+ vpunpcklqdq %ymm22,%ymm20,%ymm4
+ vpunpcklqdq %ymm23,%ymm21,%ymm5
+ vpunpckhqdq %ymm22,%ymm20,%ymm6
+ vpunpckhqdq %ymm23,%ymm21,%ymm7
+ vpunpcklqdq %ymm26,%ymm24,%ymm8
+ vpunpcklqdq %ymm27,%ymm25,%ymm9
+ vpunpckhqdq %ymm26,%ymm24,%ymm10
+ vpunpckhqdq %ymm27,%ymm25,%ymm11
+ vpunpcklqdq %ymm30,%ymm28,%ymm12
+ vpunpcklqdq %ymm31,%ymm29,%ymm13
+ vpunpckhqdq %ymm30,%ymm28,%ymm14
+ vpunpckhqdq %ymm31,%ymm29,%ymm15
+
+ # interleave 128-bit words in state n, n+4
+ # xor/write first four blocks
+ vmovdqa64 %ymm0,%ymm16
+ vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
+ cmp $0x0020,%rcx
+ jl .Lxorpart8
+ vpxord 0x0000(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0000(%rsi)
+ vmovdqa64 %ymm16,%ymm0
+ vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
+
+ vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
+ cmp $0x0040,%rcx
+ jl .Lxorpart8
+ vpxord 0x0020(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0020(%rsi)
+ vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
+
+ vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
+ cmp $0x0060,%rcx
+ jl .Lxorpart8
+ vpxord 0x0040(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0040(%rsi)
+ vperm2i128 $0x31,%ymm6,%ymm2,%ymm6
+
+ vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
+ cmp $0x0080,%rcx
+ jl .Lxorpart8
+ vpxord 0x0060(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0060(%rsi)
+ vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
+
+ vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
+ cmp $0x00a0,%rcx
+ jl .Lxorpart8
+ vpxord 0x0080(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0080(%rsi)
+ vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
+
+ vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
+ cmp $0x00c0,%rcx
+ jl .Lxorpart8
+ vpxord 0x00a0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x00a0(%rsi)
+ vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
+
+ vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
+ cmp $0x00e0,%rcx
+ jl .Lxorpart8
+ vpxord 0x00c0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x00c0(%rsi)
+ vperm2i128 $0x31,%ymm7,%ymm3,%ymm7
+
+ vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
+ cmp $0x0100,%rcx
+ jl .Lxorpart8
+ vpxord 0x00e0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x00e0(%rsi)
+ vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
+
+ # xor remaining blocks, write to output
+ vmovdqa64 %ymm4,%ymm0
+ cmp $0x0120,%rcx
+ jl .Lxorpart8
+ vpxord 0x0100(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0100(%rsi)
+
+ vmovdqa64 %ymm12,%ymm0
+ cmp $0x0140,%rcx
+ jl .Lxorpart8
+ vpxord 0x0120(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0120(%rsi)
+
+ vmovdqa64 %ymm6,%ymm0
+ cmp $0x0160,%rcx
+ jl .Lxorpart8
+ vpxord 0x0140(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0140(%rsi)
+
+ vmovdqa64 %ymm14,%ymm0
+ cmp $0x0180,%rcx
+ jl .Lxorpart8
+ vpxord 0x0160(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0160(%rsi)
+
+ vmovdqa64 %ymm5,%ymm0
+ cmp $0x01a0,%rcx
+ jl .Lxorpart8
+ vpxord 0x0180(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0180(%rsi)
+
+ vmovdqa64 %ymm13,%ymm0
+ cmp $0x01c0,%rcx
+ jl .Lxorpart8
+ vpxord 0x01a0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x01a0(%rsi)
+
+ vmovdqa64 %ymm7,%ymm0
+ cmp $0x01e0,%rcx
+ jl .Lxorpart8
+ vpxord 0x01c0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x01c0(%rsi)
+
+ vmovdqa64 %ymm15,%ymm0
+ cmp $0x0200,%rcx
+ jl .Lxorpart8
+ vpxord 0x01e0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x01e0(%rsi)
+
+.Ldone8:
+ vzeroupper
+ RET
+
+.Lxorpart8:
+ # xor remaining bytes from partial register into output
+ mov %rcx,%rax
+ and $0x1f,%rcx
+ jz .Ldone8
+ mov %rax,%r9
+ and $~0x1f,%r9
+
+ mov $1,%rax
+ shld %cl,%rax,%rax
+ sub $1,%rax
+ kmovq %rax,%k1
+
+ vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
+ vpxord %ymm0,%ymm1,%ymm1
+ vmovdqu8 %ymm1,(%rsi,%r9){%k1}
+
+ jmp .Ldone8
+
+SYM_FUNC_END(chacha_8block_xor_avx512vl)
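
The .Lxorpart* handlers above replace the AVX2 stack bounce with masked
memory operations: (1 << remainder) - 1 builds a byte mask, and vmovdqu8
with {k1}{z} loads/stores only the masked bytes. In C intrinsics the same
idea looks roughly like this (a sketch assuming AVX512BW+VL, which the
vmovdqu8 encoding requires; the function name is illustrative):

	#include <immintrin.h>

	/* XOR the final n (< 32) bytes of src against the keystream ks
	 * and store them to dst, touching no bytes beyond n. */
	static void xor_tail(unsigned char *dst, const unsigned char *src,
			     __m256i ks, unsigned int n)
	{
		__mmask32 k = ((__mmask32)1 << n) - 1;	/* low-n byte mask */
		__m256i v = _mm256_maskz_loadu_epi8(k, src);

		_mm256_mask_storeu_epi8(dst, k, _mm256_xor_si256(v, ks));
	}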
diff --git a/lib/crypto/x86/chacha-ssse3-x86_64.S b/lib/crypto/x86/chacha-ssse3-x86_64.S
new file mode 100644
index 000000000000..7111949cd5b9
--- /dev/null
+++ b/lib/crypto/x86/chacha-ssse3-x86_64.S
@@ -0,0 +1,791 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+.section .rodata.cst16.ROT8, "aM", @progbits, 16
+.align 16
+ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
+.section .rodata.cst16.ROT16, "aM", @progbits, 16
+.align 16
+ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
+.section .rodata.cst16.CTRINC, "aM", @progbits, 16
+.align 16
+CTRINC: .octa 0x00000003000000020000000100000000
+
+.text
+
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
+ * function performs matrix operations on four words in parallel, but requires
+ * shuffling to rearrange the words after each round. 8/16-bit word rotation is
+ * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
+ * rotation uses traditional shift+OR.
+ *
+ * The round count is given in %r8d.
+ *
+ * Clobbers: %r8d, %xmm4-%xmm7
+ */
+SYM_FUNC_START_LOCAL(chacha_permute)
+
+ movdqa ROT8(%rip),%xmm4
+ movdqa ROT16(%rip),%xmm5
+
+.Ldoubleround:
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm5,%xmm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm6
+ pslld $12,%xmm6
+ psrld $20,%xmm1
+ por %xmm6,%xmm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm4,%xmm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm7
+ pslld $7,%xmm7
+ psrld $25,%xmm1
+ por %xmm7,%xmm1
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ pshufd $0x39,%xmm1,%xmm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ pshufd $0x4e,%xmm2,%xmm2
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ pshufd $0x93,%xmm3,%xmm3
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm5,%xmm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm6
+ pslld $12,%xmm6
+ psrld $20,%xmm1
+ por %xmm6,%xmm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm4,%xmm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm7
+ pslld $7,%xmm7
+ psrld $25,%xmm1
+ por %xmm7,%xmm1
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ pshufd $0x93,%xmm1,%xmm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ pshufd $0x4e,%xmm2,%xmm2
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ pshufd $0x39,%xmm3,%xmm3
+
+ sub $2,%r8d
+ jnz .Ldoubleround
+
+ RET
+SYM_FUNC_END(chacha_permute)
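+
+/*
+ * Scalar sketch of the double round implemented above: with x[16] the
+ * state words and QR(a, b, c, d) defined as
+ *
+ *	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
+ *	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
+ *	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
+ *	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
+ *
+ * one double round is QR over the columns (0,4,8,12) .. (3,7,11,15)
+ * followed by QR over the diagonals (0,5,10,15) .. (3,4,9,14). Each
+ * %xmm register holds one row, so the pshufd $0x39/$0x4e/$0x93 shuffles
+ * rotate rows 1-3 to let the diagonal pass reuse the column code.
+ */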
+
+SYM_FUNC_START(chacha_block_xor_ssse3)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 1 data block output, o
+ # %rdx: up to 1 data block input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+ FRAME_BEGIN
+
+ # x0..3 = s0..3
+ movdqu 0x00(%rdi),%xmm0
+ movdqu 0x10(%rdi),%xmm1
+ movdqu 0x20(%rdi),%xmm2
+ movdqu 0x30(%rdi),%xmm3
+ movdqa %xmm0,%xmm8
+ movdqa %xmm1,%xmm9
+ movdqa %xmm2,%xmm10
+ movdqa %xmm3,%xmm11
+
+ mov %rcx,%rax
+ call chacha_permute
+
+ # o0 = i0 ^ (x0 + s0)
+ paddd %xmm8,%xmm0
+ cmp $0x10,%rax
+ jl .Lxorpart
+ movdqu 0x00(%rdx),%xmm4
+ pxor %xmm4,%xmm0
+ movdqu %xmm0,0x00(%rsi)
+ # o1 = i1 ^ (x1 + s1)
+ paddd %xmm9,%xmm1
+ movdqa %xmm1,%xmm0
+ cmp $0x20,%rax
+ jl .Lxorpart
+ movdqu 0x10(%rdx),%xmm0
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x10(%rsi)
+ # o2 = i2 ^ (x2 + s2)
+ paddd %xmm10,%xmm2
+ movdqa %xmm2,%xmm0
+ cmp $0x30,%rax
+ jl .Lxorpart
+ movdqu 0x20(%rdx),%xmm0
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,0x20(%rsi)
+ # o3 = i3 ^ (x3 + s3)
+ paddd %xmm11,%xmm3
+ movdqa %xmm3,%xmm0
+ cmp $0x40,%rax
+ jl .Lxorpart
+ movdqu 0x30(%rdx),%xmm0
+ pxor %xmm3,%xmm0
+ movdqu %xmm0,0x30(%rsi)
+
+.Ldone:
+ FRAME_END
+ RET
+
+.Lxorpart:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea 8(%rsp),%r10
+ sub $0x10,%rsp
+ and $~31,%rsp
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ pxor 0x00(%rsp),%xmm0
+ movdqa %xmm0,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ lea -8(%r10),%rsp
+ jmp .Ldone
+
+SYM_FUNC_END(chacha_block_xor_ssse3)
+
+SYM_FUNC_START(hchacha_block_ssse3)
+ # %rdi: Input state matrix, s
+ # %rsi: output (8 32-bit words)
+ # %edx: nrounds
+ FRAME_BEGIN
+
+ movdqu 0x00(%rdi),%xmm0
+ movdqu 0x10(%rdi),%xmm1
+ movdqu 0x20(%rdi),%xmm2
+ movdqu 0x30(%rdi),%xmm3
+
+ mov %edx,%r8d
+ call chacha_permute
+
+ movdqu %xmm0,0x00(%rsi)
+ movdqu %xmm3,0x10(%rsi)
+
+ FRAME_END
+ RET
+SYM_FUNC_END(hchacha_block_ssse3)
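+
+/*
+ * HChaCha is the permutation without the feed-forward addition; only the
+ * first and last rows of the permuted state are kept, which is what the
+ * two stores above do (sketch):
+ *
+ *	memcpy(&out[0], &x[0],  4 * sizeof(u32));	// words 0..3
+ *	memcpy(&out[4], &x[12], 4 * sizeof(u32));	// words 12..15
+ */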
+
+SYM_FUNC_START(chacha_4block_xor_ssse3)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 4 data blocks output, o
+ # %rdx: up to 4 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+	# This function encrypts four consecutive ChaCha blocks by loading
+	# the state matrix in SSE registers four times. As we need some
+	# scratch registers, we save the first four registers on the stack.
+	# The algorithm performs each operation on the corresponding word of
+	# each state matrix, hence requires no word shuffling. For the final
+	# XOR step we transpose the matrix by interleaving 32- and then
+	# 64-bit words, which allows us to do XOR in SSE registers. 8/16-bit
+	# word rotation is done with the slightly better-performing SSSE3
+	# byte shuffling, 7/12-bit word rotation uses traditional shift+OR.
+
+ lea 8(%rsp),%r10
+ sub $0x80,%rsp
+ and $~63,%rsp
+ mov %rcx,%rax
+
+ # x0..15[0-3] = s0..3[0..3]
+ movq 0x00(%rdi),%xmm1
+ pshufd $0x00,%xmm1,%xmm0
+ pshufd $0x55,%xmm1,%xmm1
+ movq 0x08(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ movq 0x10(%rdi),%xmm5
+ pshufd $0x00,%xmm5,%xmm4
+ pshufd $0x55,%xmm5,%xmm5
+ movq 0x18(%rdi),%xmm7
+ pshufd $0x00,%xmm7,%xmm6
+ pshufd $0x55,%xmm7,%xmm7
+ movq 0x20(%rdi),%xmm9
+ pshufd $0x00,%xmm9,%xmm8
+ pshufd $0x55,%xmm9,%xmm9
+ movq 0x28(%rdi),%xmm11
+ pshufd $0x00,%xmm11,%xmm10
+ pshufd $0x55,%xmm11,%xmm11
+ movq 0x30(%rdi),%xmm13
+ pshufd $0x00,%xmm13,%xmm12
+ pshufd $0x55,%xmm13,%xmm13
+ movq 0x38(%rdi),%xmm15
+ pshufd $0x00,%xmm15,%xmm14
+ pshufd $0x55,%xmm15,%xmm15
+ # x0..3 on stack
+ movdqa %xmm0,0x00(%rsp)
+ movdqa %xmm1,0x10(%rsp)
+ movdqa %xmm2,0x20(%rsp)
+ movdqa %xmm3,0x30(%rsp)
+
+ movdqa CTRINC(%rip),%xmm1
+ movdqa ROT8(%rip),%xmm2
+ movdqa ROT16(%rip),%xmm3
+
+ # x12 += counter values 0-3
+ paddd %xmm1,%xmm12
+
+.Ldoubleround4:
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ movdqa 0x00(%rsp),%xmm0
+ paddd %xmm4,%xmm0
+ movdqa %xmm0,0x00(%rsp)
+ pxor %xmm0,%xmm12
+ pshufb %xmm3,%xmm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ movdqa 0x10(%rsp),%xmm0
+ paddd %xmm5,%xmm0
+ movdqa %xmm0,0x10(%rsp)
+ pxor %xmm0,%xmm13
+ pshufb %xmm3,%xmm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ movdqa 0x20(%rsp),%xmm0
+ paddd %xmm6,%xmm0
+ movdqa %xmm0,0x20(%rsp)
+ pxor %xmm0,%xmm14
+ pshufb %xmm3,%xmm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ movdqa 0x30(%rsp),%xmm0
+ paddd %xmm7,%xmm0
+ movdqa %xmm0,0x30(%rsp)
+ pxor %xmm0,%xmm15
+ pshufb %xmm3,%xmm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm4
+ por %xmm0,%xmm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm5
+ por %xmm0,%xmm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm6
+ por %xmm0,%xmm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm7
+ por %xmm0,%xmm7
+
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ movdqa 0x00(%rsp),%xmm0
+ paddd %xmm4,%xmm0
+ movdqa %xmm0,0x00(%rsp)
+ pxor %xmm0,%xmm12
+ pshufb %xmm2,%xmm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ movdqa 0x10(%rsp),%xmm0
+ paddd %xmm5,%xmm0
+ movdqa %xmm0,0x10(%rsp)
+ pxor %xmm0,%xmm13
+ pshufb %xmm2,%xmm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ movdqa 0x20(%rsp),%xmm0
+ paddd %xmm6,%xmm0
+ movdqa %xmm0,0x20(%rsp)
+ pxor %xmm0,%xmm14
+ pshufb %xmm2,%xmm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ movdqa 0x30(%rsp),%xmm0
+ paddd %xmm7,%xmm0
+ movdqa %xmm0,0x30(%rsp)
+ pxor %xmm0,%xmm15
+ pshufb %xmm2,%xmm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm4
+ por %xmm0,%xmm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm5
+ por %xmm0,%xmm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm6
+ por %xmm0,%xmm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm7
+ por %xmm0,%xmm7
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ movdqa 0x00(%rsp),%xmm0
+ paddd %xmm5,%xmm0
+ movdqa %xmm0,0x00(%rsp)
+ pxor %xmm0,%xmm15
+ pshufb %xmm3,%xmm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ movdqa 0x10(%rsp),%xmm0
+ paddd %xmm6,%xmm0
+ movdqa %xmm0,0x10(%rsp)
+ pxor %xmm0,%xmm12
+ pshufb %xmm3,%xmm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ movdqa 0x20(%rsp),%xmm0
+ paddd %xmm7,%xmm0
+ movdqa %xmm0,0x20(%rsp)
+ pxor %xmm0,%xmm13
+ pshufb %xmm3,%xmm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ movdqa 0x30(%rsp),%xmm0
+ paddd %xmm4,%xmm0
+ movdqa %xmm0,0x30(%rsp)
+ pxor %xmm0,%xmm14
+ pshufb %xmm3,%xmm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ paddd %xmm15,%xmm10
+ pxor %xmm10,%xmm5
+ movdqa %xmm5,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm5
+ por %xmm0,%xmm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ paddd %xmm12,%xmm11
+ pxor %xmm11,%xmm6
+ movdqa %xmm6,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm6
+ por %xmm0,%xmm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ paddd %xmm13,%xmm8
+ pxor %xmm8,%xmm7
+ movdqa %xmm7,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm7
+ por %xmm0,%xmm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ paddd %xmm14,%xmm9
+ pxor %xmm9,%xmm4
+ movdqa %xmm4,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm4
+ por %xmm0,%xmm4
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ movdqa 0x00(%rsp),%xmm0
+ paddd %xmm5,%xmm0
+ movdqa %xmm0,0x00(%rsp)
+ pxor %xmm0,%xmm15
+ pshufb %xmm2,%xmm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ movdqa 0x10(%rsp),%xmm0
+ paddd %xmm6,%xmm0
+ movdqa %xmm0,0x10(%rsp)
+ pxor %xmm0,%xmm12
+ pshufb %xmm2,%xmm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ movdqa 0x20(%rsp),%xmm0
+ paddd %xmm7,%xmm0
+ movdqa %xmm0,0x20(%rsp)
+ pxor %xmm0,%xmm13
+ pshufb %xmm2,%xmm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ movdqa 0x30(%rsp),%xmm0
+ paddd %xmm4,%xmm0
+ movdqa %xmm0,0x30(%rsp)
+ pxor %xmm0,%xmm14
+ pshufb %xmm2,%xmm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ paddd %xmm15,%xmm10
+ pxor %xmm10,%xmm5
+ movdqa %xmm5,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm5
+ por %xmm0,%xmm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ paddd %xmm12,%xmm11
+ pxor %xmm11,%xmm6
+ movdqa %xmm6,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm6
+ por %xmm0,%xmm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ paddd %xmm13,%xmm8
+ pxor %xmm8,%xmm7
+ movdqa %xmm7,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm7
+ por %xmm0,%xmm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ paddd %xmm14,%xmm9
+ pxor %xmm9,%xmm4
+ movdqa %xmm4,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm4
+ por %xmm0,%xmm4
+
+ sub $2,%r8d
+ jnz .Ldoubleround4
+
+ # x0[0-3] += s0[0]
+ # x1[0-3] += s0[1]
+ movq 0x00(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd 0x00(%rsp),%xmm2
+ movdqa %xmm2,0x00(%rsp)
+ paddd 0x10(%rsp),%xmm3
+ movdqa %xmm3,0x10(%rsp)
+ # x2[0-3] += s0[2]
+ # x3[0-3] += s0[3]
+ movq 0x08(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd 0x20(%rsp),%xmm2
+ movdqa %xmm2,0x20(%rsp)
+ paddd 0x30(%rsp),%xmm3
+ movdqa %xmm3,0x30(%rsp)
+
+ # x4[0-3] += s1[0]
+ # x5[0-3] += s1[1]
+ movq 0x10(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ # x6[0-3] += s1[2]
+ # x7[0-3] += s1[3]
+ movq 0x18(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm6
+ paddd %xmm3,%xmm7
+
+ # x8[0-3] += s2[0]
+ # x9[0-3] += s2[1]
+ movq 0x20(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm8
+ paddd %xmm3,%xmm9
+ # x10[0-3] += s2[2]
+ # x11[0-3] += s2[3]
+ movq 0x28(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm10
+ paddd %xmm3,%xmm11
+
+ # x12[0-3] += s3[0]
+ # x13[0-3] += s3[1]
+ movq 0x30(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm12
+ paddd %xmm3,%xmm13
+ # x14[0-3] += s3[2]
+ # x15[0-3] += s3[3]
+ movq 0x38(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm14
+ paddd %xmm3,%xmm15
+
+ # x12 += counter values 0-3
+ paddd %xmm1,%xmm12
+
+ # interleave 32-bit words in state n, n+1
+ movdqa 0x00(%rsp),%xmm0
+ movdqa 0x10(%rsp),%xmm1
+ movdqa %xmm0,%xmm2
+ punpckldq %xmm1,%xmm2
+ punpckhdq %xmm1,%xmm0
+ movdqa %xmm2,0x00(%rsp)
+ movdqa %xmm0,0x10(%rsp)
+ movdqa 0x20(%rsp),%xmm0
+ movdqa 0x30(%rsp),%xmm1
+ movdqa %xmm0,%xmm2
+ punpckldq %xmm1,%xmm2
+ punpckhdq %xmm1,%xmm0
+ movdqa %xmm2,0x20(%rsp)
+ movdqa %xmm0,0x30(%rsp)
+ movdqa %xmm4,%xmm0
+ punpckldq %xmm5,%xmm4
+ punpckhdq %xmm5,%xmm0
+ movdqa %xmm0,%xmm5
+ movdqa %xmm6,%xmm0
+ punpckldq %xmm7,%xmm6
+ punpckhdq %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ movdqa %xmm8,%xmm0
+ punpckldq %xmm9,%xmm8
+ punpckhdq %xmm9,%xmm0
+ movdqa %xmm0,%xmm9
+ movdqa %xmm10,%xmm0
+ punpckldq %xmm11,%xmm10
+ punpckhdq %xmm11,%xmm0
+ movdqa %xmm0,%xmm11
+ movdqa %xmm12,%xmm0
+ punpckldq %xmm13,%xmm12
+ punpckhdq %xmm13,%xmm0
+ movdqa %xmm0,%xmm13
+ movdqa %xmm14,%xmm0
+ punpckldq %xmm15,%xmm14
+ punpckhdq %xmm15,%xmm0
+ movdqa %xmm0,%xmm15
+
+ # interleave 64-bit words in state n, n+2
+ movdqa 0x00(%rsp),%xmm0
+ movdqa 0x20(%rsp),%xmm1
+ movdqa %xmm0,%xmm2
+ punpcklqdq %xmm1,%xmm2
+ punpckhqdq %xmm1,%xmm0
+ movdqa %xmm2,0x00(%rsp)
+ movdqa %xmm0,0x20(%rsp)
+ movdqa 0x10(%rsp),%xmm0
+ movdqa 0x30(%rsp),%xmm1
+ movdqa %xmm0,%xmm2
+ punpcklqdq %xmm1,%xmm2
+ punpckhqdq %xmm1,%xmm0
+ movdqa %xmm2,0x10(%rsp)
+ movdqa %xmm0,0x30(%rsp)
+ movdqa %xmm4,%xmm0
+ punpcklqdq %xmm6,%xmm4
+ punpckhqdq %xmm6,%xmm0
+ movdqa %xmm0,%xmm6
+ movdqa %xmm5,%xmm0
+ punpcklqdq %xmm7,%xmm5
+ punpckhqdq %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ movdqa %xmm8,%xmm0
+ punpcklqdq %xmm10,%xmm8
+ punpckhqdq %xmm10,%xmm0
+ movdqa %xmm0,%xmm10
+ movdqa %xmm9,%xmm0
+ punpcklqdq %xmm11,%xmm9
+ punpckhqdq %xmm11,%xmm0
+ movdqa %xmm0,%xmm11
+ movdqa %xmm12,%xmm0
+ punpcklqdq %xmm14,%xmm12
+ punpckhqdq %xmm14,%xmm0
+ movdqa %xmm0,%xmm14
+ movdqa %xmm13,%xmm0
+ punpcklqdq %xmm15,%xmm13
+ punpckhqdq %xmm15,%xmm0
+ movdqa %xmm0,%xmm15
+
+ # xor with corresponding input, write to output
+ movdqa 0x00(%rsp),%xmm0
+ cmp $0x10,%rax
+ jl .Lxorpart4
+ movdqu 0x00(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x00(%rsi)
+
+	movdqa		%xmm4,%xmm0
+ cmp $0x20,%rax
+ jl .Lxorpart4
+ movdqu 0x10(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x10(%rsi)
+
+	movdqa		%xmm8,%xmm0
+ cmp $0x30,%rax
+ jl .Lxorpart4
+ movdqu 0x20(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x20(%rsi)
+
+	movdqa		%xmm12,%xmm0
+ cmp $0x40,%rax
+ jl .Lxorpart4
+ movdqu 0x30(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x30(%rsi)
+
+ movdqa 0x20(%rsp),%xmm0
+ cmp $0x50,%rax
+ jl .Lxorpart4
+ movdqu 0x40(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x40(%rsi)
+
+	movdqa		%xmm6,%xmm0
+ cmp $0x60,%rax
+ jl .Lxorpart4
+ movdqu 0x50(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x50(%rsi)
+
+	movdqa		%xmm10,%xmm0
+ cmp $0x70,%rax
+ jl .Lxorpart4
+ movdqu 0x60(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x60(%rsi)
+
+	movdqa		%xmm14,%xmm0
+ cmp $0x80,%rax
+ jl .Lxorpart4
+ movdqu 0x70(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x70(%rsi)
+
+ movdqa 0x10(%rsp),%xmm0
+ cmp $0x90,%rax
+ jl .Lxorpart4
+ movdqu 0x80(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x80(%rsi)
+
+	movdqa		%xmm5,%xmm0
+ cmp $0xa0,%rax
+ jl .Lxorpart4
+ movdqu 0x90(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x90(%rsi)
+
+	movdqa		%xmm9,%xmm0
+ cmp $0xb0,%rax
+ jl .Lxorpart4
+ movdqu 0xa0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xa0(%rsi)
+
+	movdqa		%xmm13,%xmm0
+ cmp $0xc0,%rax
+ jl .Lxorpart4
+ movdqu 0xb0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xb0(%rsi)
+
+ movdqa 0x30(%rsp),%xmm0
+ cmp $0xd0,%rax
+ jl .Lxorpart4
+ movdqu 0xc0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xc0(%rsi)
+
+	movdqa		%xmm7,%xmm0
+ cmp $0xe0,%rax
+ jl .Lxorpart4
+ movdqu 0xd0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xd0(%rsi)
+
+	movdqa		%xmm11,%xmm0
+ cmp $0xf0,%rax
+ jl .Lxorpart4
+ movdqu 0xe0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xe0(%rsi)
+
+	movdqa		%xmm15,%xmm0
+ cmp $0x100,%rax
+ jl .Lxorpart4
+ movdqu 0xf0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xf0(%rsi)
+
+.Ldone4:
+ lea -8(%r10),%rsp
+ RET
+
+.Lxorpart4:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone4
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ pxor 0x00(%rsp),%xmm0
+ movdqa %xmm0,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ jmp .Ldone4
+
+SYM_FUNC_END(chacha_4block_xor_ssse3)
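+
+/*
+ * The interleave passes in chacha_4block_xor_ssse3 above amount to a 4x4
+ * transpose of 32-bit lanes per group of four state registers, i.e. the
+ * data movement of (sketch):
+ *
+ *	for (i = 0; i < 4; i++)
+ *		for (j = i + 1; j < 4; j++)
+ *			swap(m[i][j], m[j][i]);
+ *
+ * so that each register ends up holding four consecutive words of one
+ * block instead of word n of all four blocks.
+ */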
diff --git a/lib/crypto/x86/chacha.h b/lib/crypto/x86/chacha.h
new file mode 100644
index 000000000000..10cf8f1c569d
--- /dev/null
+++ b/lib/crypto/x86/chacha.h
@@ -0,0 +1,176 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ChaCha and HChaCha functions (x86_64 optimized)
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <asm/simd.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+
+asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void hchacha_block_ssse3(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds);
+
+asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+
+asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
+
+static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
+{
+ len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
+ return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
+}
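+
+/*
+ * Worked example: a 200-byte tail handed to the 8-block kernel consumes
+ * four 64-byte blocks of keystream, so chacha_advance(200, 8) == 4:
+ * min(200, 8 * 64) == 200 and round_up(200, 64) / 64 == 256 / 64 == 4.
+ */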
+
+static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ if (static_branch_likely(&chacha_use_avx512vl)) {
+ while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+ chacha_8block_xor_avx512vl(state, dst, src, bytes,
+ nrounds);
+ bytes -= CHACHA_BLOCK_SIZE * 8;
+ src += CHACHA_BLOCK_SIZE * 8;
+ dst += CHACHA_BLOCK_SIZE * 8;
+ state->x[12] += 8;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 4) {
+ chacha_8block_xor_avx512vl(state, dst, src, bytes,
+ nrounds);
+ state->x[12] += chacha_advance(bytes, 8);
+ return;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 2) {
+ chacha_4block_xor_avx512vl(state, dst, src, bytes,
+ nrounds);
+ state->x[12] += chacha_advance(bytes, 4);
+ return;
+ }
+ if (bytes) {
+ chacha_2block_xor_avx512vl(state, dst, src, bytes,
+ nrounds);
+ state->x[12] += chacha_advance(bytes, 2);
+ return;
+ }
+ }
+
+ if (static_branch_likely(&chacha_use_avx2)) {
+ while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+ chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+ bytes -= CHACHA_BLOCK_SIZE * 8;
+ src += CHACHA_BLOCK_SIZE * 8;
+ dst += CHACHA_BLOCK_SIZE * 8;
+ state->x[12] += 8;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 4) {
+ chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+ state->x[12] += chacha_advance(bytes, 8);
+ return;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 2) {
+ chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
+ state->x[12] += chacha_advance(bytes, 4);
+ return;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE) {
+ chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
+ state->x[12] += chacha_advance(bytes, 2);
+ return;
+ }
+ }
+
+ while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+ chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+ bytes -= CHACHA_BLOCK_SIZE * 4;
+ src += CHACHA_BLOCK_SIZE * 4;
+ dst += CHACHA_BLOCK_SIZE * 4;
+ state->x[12] += 4;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE) {
+ chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+ state->x[12] += chacha_advance(bytes, 4);
+ return;
+ }
+ if (bytes) {
+ chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
+ state->x[12]++;
+ }
+}
+
+static void hchacha_block_arch(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+ if (!static_branch_likely(&chacha_use_simd)) {
+ hchacha_block_generic(state, out, nrounds);
+ } else {
+ kernel_fpu_begin();
+ hchacha_block_ssse3(state, out, nrounds);
+ kernel_fpu_end();
+ }
+}
+
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int bytes, int nrounds)
+{
+ if (!static_branch_likely(&chacha_use_simd) ||
+ bytes <= CHACHA_BLOCK_SIZE)
+ return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ kernel_fpu_begin();
+ chacha_dosimd(state, dst, src, todo, nrounds);
+ kernel_fpu_end();
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
+}
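+
+/*
+ * The SZ_4K cap bounds the length of each kernel_fpu_begin() section,
+ * which runs with preemption disabled: a 64 KiB message, for example, is
+ * handled as sixteen 4 KiB chunks rather than one long non-preemptible
+ * stretch.
+ */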
+
+#define chacha_mod_init_arch chacha_mod_init_arch
+static void chacha_mod_init_arch(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_SSSE3))
+ return;
+
+ static_branch_enable(&chacha_use_simd);
+
+ if (boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
+ static_branch_enable(&chacha_use_avx2);
+
+ if (boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
+ static_branch_enable(&chacha_use_avx512vl);
+ }
+}
diff --git a/lib/crypto/x86/curve25519.h b/lib/crypto/x86/curve25519.h
new file mode 100644
index 000000000000..5c0b8408852d
--- /dev/null
+++ b/lib/crypto/x86/curve25519.h
@@ -0,0 +1,1613 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation
+ */
+
+#include <linux/types.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+
+static __always_inline u64 eq_mask(u64 a, u64 b)
+{
+ u64 x = a ^ b;
+ u64 minus_x = ~x + (u64)1U;
+ u64 x_or_minus_x = x | minus_x;
+ u64 xnx = x_or_minus_x >> (u32)63U;
+ return xnx - (u64)1U;
+}
+
+static __always_inline u64 gte_mask(u64 a, u64 b)
+{
+ u64 x = a;
+ u64 y = b;
+ u64 x_xor_y = x ^ y;
+ u64 x_sub_y = x - y;
+ u64 x_sub_y_xor_y = x_sub_y ^ y;
+ u64 q = x_xor_y | x_sub_y_xor_y;
+ u64 x_xor_q = x ^ q;
+ u64 x_xor_q_ = x_xor_q >> (u32)63U;
+ return x_xor_q_ - (u64)1U;
+}
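+
+/*
+ * Both helpers return an all-ones or all-zero mask without branching:
+ * eq_mask(a, b) is ~0 iff a == b, and gte_mask(a, b) is ~0 iff a >= b.
+ * A typical constant-time use is a branch-free select (sketch; see
+ * store_felem() below for the real masked subtraction of p):
+ *
+ *	static u64 ct_select(u64 mask, u64 x, u64 y)
+ *	{
+ *		return (mask & x) | (~mask & y);	// mask ? x : y
+ *	}
+ */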
+
+/* Computes the addition of the four-element bignum f1 with the scalar
+ * value f2, and returns the carry (if any) */
+static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
+{
+ u64 carry_r;
+
+ asm volatile(
+ /* Clear registers to propagate the carry bit */
+ " xor %%r8d, %%r8d;"
+ " xor %%r9d, %%r9d;"
+ " xor %%r10d, %%r10d;"
+ " xor %%r11d, %%r11d;"
+ " xor %k1, %k1;"
+
+ /* Begin addition chain */
+ " addq 0(%3), %0;"
+ " movq %0, 0(%2);"
+ " adcxq 8(%3), %%r8;"
+ " movq %%r8, 8(%2);"
+ " adcxq 16(%3), %%r9;"
+ " movq %%r9, 16(%2);"
+ " adcxq 24(%3), %%r10;"
+ " movq %%r10, 24(%2);"
+
+ /* Return the carry bit in a register */
+ " adcx %%r11, %1;"
+ : "+&r"(f2), "=&r"(carry_r)
+ : "r"(out), "r"(f1)
+ : "%r8", "%r9", "%r10", "%r11", "memory", "cc");
+
+ return carry_r;
+}
+
+/* Computes the field addition of two field elements */
+static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
+{
+ asm volatile(
+ /* Compute the raw addition of f1 + f2 */
+ " movq 0(%0), %%r8;"
+ " addq 0(%2), %%r8;"
+ " movq 8(%0), %%r9;"
+ " adcxq 8(%2), %%r9;"
+ " movq 16(%0), %%r10;"
+ " adcxq 16(%2), %%r10;"
+ " movq 24(%0), %%r11;"
+ " adcxq 24(%2), %%r11;"
+
+ /* Wrap the result back into the field */
+
+ /* Step 1: Compute carry*38 */
+ " mov $0, %%rax;"
+ " mov $38, %0;"
+ " cmovc %0, %%rax;"
+
+ /* Step 2: Add carry*38 to the original sum */
+ " xor %%ecx, %%ecx;"
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%1);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%1);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %0, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%1);"
+ : "+&r"(f2)
+ : "r"(out), "r"(f1)
+ : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
+}
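+
+/*
+ * Why the factor 38: the field is GF(2^255 - 19), so 2^255 == 19 and
+ * 2^256 == 2 * 19 == 38 (mod p). A carry out of the four-limb (256-bit)
+ * sum is therefore worth +38. Sketch of the fold, with add4() standing
+ * in for the raw 256-bit addition (hypothetical helper, illustration
+ * only; the hi * 38 reductions in fmul()/fsqr() below rest on the same
+ * identity):
+ *
+ *	u64 carry = add4(out, f1, f2);			// 0 or 1
+ *	carry = add_scalar(out, out, carry * 38);	// propagate +38
+ *	out[0] += carry * 38;				// cannot carry again
+ */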
+
+/* Computes the field subtraction of two field elements */
+static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
+{
+ asm volatile(
+ /* Compute the raw subtraction of f1-f2 */
+ " movq 0(%1), %%r8;"
+ " subq 0(%2), %%r8;"
+ " movq 8(%1), %%r9;"
+ " sbbq 8(%2), %%r9;"
+ " movq 16(%1), %%r10;"
+ " sbbq 16(%2), %%r10;"
+ " movq 24(%1), %%r11;"
+ " sbbq 24(%2), %%r11;"
+
+ /* Wrap the result back into the field */
+
+ /* Step 1: Compute carry*38 */
+ " mov $0, %%rax;"
+ " mov $38, %%rcx;"
+ " cmovc %%rcx, %%rax;"
+
+ /* Step 2: Subtract carry*38 from the original difference */
+ " sub %%rax, %%r8;"
+ " sbb $0, %%r9;"
+ " sbb $0, %%r10;"
+ " sbb $0, %%r11;"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rcx, %%rax;"
+ " sub %%rax, %%r8;"
+
+ /* Store the result */
+ " movq %%r8, 0(%0);"
+ " movq %%r9, 8(%0);"
+ " movq %%r10, 16(%0);"
+ " movq %%r11, 24(%0);"
+ :
+ : "r"(out), "r"(f1), "r"(f2)
+ : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
+}
+
+/* Computes a field multiplication: out <- f1 * f2
+ * Uses the 8-element buffer tmp for intermediate results */
+static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
+{
+ asm volatile(
+
+ /* Compute the raw multiplication: tmp <- src1 * src2 */
+
+ /* Compute src1[0] * src2 */
+ " movq 0(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " movq %%r8, 0(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 8(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+
+ /* Compute src1[1] * src2 */
+ " movq 8(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 8(%2), %%r8;"
+ " movq %%r8, 8(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 16(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
+ /* Compute src1[2] * src2 */
+ " movq 16(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 16(%2), %%r8;"
+ " movq %%r8, 16(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 24(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
+ /* Compute src1[3] * src2 */
+ " movq 24(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 24(%2), %%r8;"
+ " movq %%r8, 24(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 32(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " movq %%rbx, 40(%2);"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 48(%2);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 56(%2);"
+
+ /* Line up pointers */
+ " mov %2, %0;"
+ " mov %3, %2;"
+
+ /* Wrap the result back into the field */
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 32(%0), %%r8, %%r13;"
+ " xor %k1, %k1;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
+ " adcx %%r13, %%r9;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
+ " adcx %%rbx, %%r10;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 24(%0), %%r11;"
+ " adcx %1, %%rax;"
+ " adox %1, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %1, %%r9;"
+ " movq %%r9, 8(%2);"
+ " adcx %1, %%r10;"
+ " movq %%r10, 16(%2);"
+ " adcx %1, %%r11;"
+ " movq %%r11, 24(%2);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%2);"
+ : "+&r"(f1), "+&r"(f2), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
+ "%r14", "memory", "cc");
+}
+
+/* Computes two field multiplications:
+ * out[0] <- f1[0] * f2[0]
+ * out[1] <- f1[1] * f2[1]
+ * Uses the 16-element buffer tmp for intermediate results */
+static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
+{
+ asm volatile(
+
+ /* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */
+
+ /* Compute src1[0] * src2 */
+ " movq 0(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " movq %%r8, 0(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 8(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+
+ /* Compute src1[1] * src2 */
+ " movq 8(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 8(%2), %%r8;"
+ " movq %%r8, 8(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 16(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
+ /* Compute src1[2] * src2 */
+ " movq 16(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 16(%2), %%r8;"
+ " movq %%r8, 16(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 24(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
+ /* Compute src1[3] * src2 */
+ " movq 24(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 24(%2), %%r8;"
+ " movq %%r8, 24(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 32(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " movq %%rbx, 40(%2);"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 48(%2);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 56(%2);"
+
+ /* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */
+
+ /* Compute src1[0] * src2 */
+ " movq 32(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " movq %%r8, 64(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 72(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+
+ /* Compute src1[1] * src2 */
+ " movq 40(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 72(%2), %%r8;"
+ " movq %%r8, 72(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 80(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
+ /* Compute src1[2] * src2 */
+ " movq 48(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 80(%2), %%r8;"
+ " movq %%r8, 80(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 88(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
+ /* Compute src1[3] * src2 */
+ " movq 56(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 88(%2), %%r8;"
+ " movq %%r8, 88(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 96(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " movq %%rbx, 104(%2);"
+ " mov $0, %%r8;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 112(%2);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 120(%2);"
+
+ /* Line up pointers */
+ " mov %2, %0;"
+ " mov %3, %2;"
+
+ /* Wrap the results back into the field */
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 32(%0), %%r8, %%r13;"
+ " xor %k1, %k1;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
+ " adcx %%r13, %%r9;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
+ " adcx %%rbx, %%r10;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 24(%0), %%r11;"
+ " adcx %1, %%rax;"
+ " adox %1, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %1, %%r9;"
+ " movq %%r9, 8(%2);"
+ " adcx %1, %%r10;"
+ " movq %%r10, 16(%2);"
+ " adcx %1, %%r11;"
+ " movq %%r11, 24(%2);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%2);"
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 96(%0), %%r8, %%r13;"
+ " xor %k1, %k1;"
+ " adoxq 64(%0), %%r8;"
+ " mulxq 104(%0), %%r9, %%rbx;"
+ " adcx %%r13, %%r9;"
+ " adoxq 72(%0), %%r9;"
+ " mulxq 112(%0), %%r10, %%r13;"
+ " adcx %%rbx, %%r10;"
+ " adoxq 80(%0), %%r10;"
+ " mulxq 120(%0), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 88(%0), %%r11;"
+ " adcx %1, %%rax;"
+ " adox %1, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %1, %%r9;"
+ " movq %%r9, 40(%2);"
+ " adcx %1, %%r10;"
+ " movq %%r10, 48(%2);"
+ " adcx %1, %%r11;"
+ " movq %%r11, 56(%2);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 32(%2);"
+ : "+&r"(f1), "+&r"(f2), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
+ "%r14", "memory", "cc");
+}
+
+/* Computes the field multiplication of the four-element f1 by the scalar
+ * value f2. Requires f2 to be smaller than 2^17 */
+static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
+{
+ register u64 f2_r asm("rdx") = f2;
+
+ asm volatile(
+ /* Compute the raw multiplication of f1*f2 */
+ " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
+ " mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
+ " add %%rcx, %%r9;"
+ " mov $0, %%rcx;"
+ " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
+ " adcx %%rbx, %%r10;"
+ " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
+ " adcx %%r13, %%r11;"
+ " adcx %%rcx, %%rax;"
+
+ /* Wrap the result back into the field */
+
+ /* Step 1: Compute carry*38 */
+ " mov $38, %%rdx;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%1);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%1);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%1);"
+ : "+&r"(f2_r)
+ : "r"(out), "r"(f1)
+ : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13",
+ "memory", "cc");
+}
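+
+/*
+ * The f2 < 2^17 bound keeps the high product word small enough that its
+ * * 38 fold cannot overflow a limb. Both callers below use the curve
+ * constant (486662 - 2) / 4:
+ *
+ *	fmul_scalar(b1, c, (u64)121665U);	// point_add_and_double()
+ *	fmul_scalar(b, c, (u64)121665U);	// point_double()
+ */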
+
+/* Conditionally swaps the contents of p1 and p2 in constant time:
+ * p1 <-> p2 if bit is 1, both unchanged if bit is 0 */
+static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
+{
+ asm volatile(
+ /* Transfer bit into CF flag */
+ " add $18446744073709551615, %0;"
+
+ /* cswap p1[0], p2[0] */
+ " movq 0(%1), %%r8;"
+ " movq 0(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 0(%1);"
+ " movq %%r9, 0(%2);"
+
+ /* cswap p1[1], p2[1] */
+ " movq 8(%1), %%r8;"
+ " movq 8(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 8(%1);"
+ " movq %%r9, 8(%2);"
+
+ /* cswap p1[2], p2[2] */
+ " movq 16(%1), %%r8;"
+ " movq 16(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 16(%1);"
+ " movq %%r9, 16(%2);"
+
+ /* cswap p1[3], p2[3] */
+ " movq 24(%1), %%r8;"
+ " movq 24(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 24(%1);"
+ " movq %%r9, 24(%2);"
+
+ /* cswap p1[4], p2[4] */
+ " movq 32(%1), %%r8;"
+ " movq 32(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 32(%1);"
+ " movq %%r9, 32(%2);"
+
+ /* cswap p1[5], p2[5] */
+ " movq 40(%1), %%r8;"
+ " movq 40(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 40(%1);"
+ " movq %%r9, 40(%2);"
+
+ /* cswap p1[6], p2[6] */
+ " movq 48(%1), %%r8;"
+ " movq 48(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 48(%1);"
+ " movq %%r9, 48(%2);"
+
+ /* cswap p1[7], p2[7] */
+ " movq 56(%1), %%r8;"
+ " movq 56(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 56(%1);"
+ " movq %%r9, 56(%2);"
+ : "+&r"(bit)
+ : "r"(p1), "r"(p2)
+ : "%r8", "%r9", "%r10", "memory", "cc");
+}
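+
+/*
+ * The cmovc pairs above are the flag-driven form of the classic xor-mask
+ * conditional swap; an equivalent sketch, with bit in {0, 1}:
+ *
+ *	u64 mask = 0 - bit;			// 0 or ~0
+ *	for (i = 0; i < 8; i++) {
+ *		u64 d = (p1[i] ^ p2[i]) & mask;
+ *		p1[i] ^= d;
+ *		p2[i] ^= d;
+ *	}
+ */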
+
+/* Computes the square of a field element: out <- f * f
+ * Uses the 8-element buffer tmp for intermediate results */
+static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
+{
+ asm volatile(
+ /* Compute the raw multiplication: tmp <- f * f */
+
+ /* Step 1: Compute all partial products */
+ " movq 0(%0), %%rdx;" /* f[0] */
+ " mulxq 8(%0), %%r8, %%r14;"
+ " xor %%r15d, %%r15d;" /* f[1]*f[0] */
+ " mulxq 16(%0), %%r9, %%r10;"
+ " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 24(%0), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 24(%0), %%rdx;" /* f[3] */
+ " mulxq 8(%0), %%r11, %%rbx;"
+ " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%0), %%rax, %%r13;"
+ " adcx %%rax, %%rbx;" /* f[2]*f[3] */
+ " movq 8(%0), %%rdx;"
+ " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 16(%0), %%rax, %%rcx;"
+ " mov $0, %%r14;" /* f[2]*f[1] */
+
+ /* Step 2: Compute two parallel carry chains */
+ " xor %%r15d, %%r15d;"
+ " adox %%rax, %%r10;"
+ " adcx %%r8, %%r8;"
+ " adox %%rcx, %%r11;"
+ " adcx %%r9, %%r9;"
+ " adox %%r15, %%rbx;"
+ " adcx %%r10, %%r10;"
+ " adox %%r15, %%r13;"
+ " adcx %%r11, %%r11;"
+ " adox %%r15, %%r14;"
+ " adcx %%rbx, %%rbx;"
+ " adcx %%r13, %%r13;"
+ " adcx %%r14, %%r14;"
+
+ /* Step 3: Compute intermediate squares */
+ " movq 0(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 0(%1);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 8(%1);"
+ " movq 8(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 16(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 24(%1);"
+ " movq 16(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 32(%1);"
+ " adcx %%rcx, %%rbx;"
+ " movq %%rbx, 40(%1);"
+ " movq 24(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 48(%1);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 56(%1);"
+
+ /* Line up pointers */
+ " mov %1, %0;"
+ " mov %2, %1;"
+
+ /* Wrap the result back into the field */
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 32(%0), %%r8, %%r13;"
+ " xor %%ecx, %%ecx;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
+ " adcx %%r13, %%r9;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
+ " adcx %%rbx, %%r10;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 24(%0), %%r11;"
+ " adcx %%rcx, %%rax;"
+ " adox %%rcx, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%1);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%1);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%1);"
+ : "+&r"(f), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
+ "%r13", "%r14", "%r15", "memory", "cc");
+}
+
+/* Computes two field squarings:
+ * out[0] <- f[0] * f[0]
+ * out[1] <- f[1] * f[1]
+ * Uses the 16-element buffer tmp for intermediate results */
+static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
+{
+ asm volatile(
+ /* Step 1: Compute all partial products */
+ " movq 0(%0), %%rdx;" /* f[0] */
+ " mulxq 8(%0), %%r8, %%r14;"
+ " xor %%r15d, %%r15d;" /* f[1]*f[0] */
+ " mulxq 16(%0), %%r9, %%r10;"
+ " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 24(%0), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 24(%0), %%rdx;" /* f[3] */
+ " mulxq 8(%0), %%r11, %%rbx;"
+ " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%0), %%rax, %%r13;"
+ " adcx %%rax, %%rbx;" /* f[2]*f[3] */
+ " movq 8(%0), %%rdx;"
+ " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 16(%0), %%rax, %%rcx;"
+ " mov $0, %%r14;" /* f[2]*f[1] */
+
+ /* Step 2: Compute two parallel carry chains */
+ " xor %%r15d, %%r15d;"
+ " adox %%rax, %%r10;"
+ " adcx %%r8, %%r8;"
+ " adox %%rcx, %%r11;"
+ " adcx %%r9, %%r9;"
+ " adox %%r15, %%rbx;"
+ " adcx %%r10, %%r10;"
+ " adox %%r15, %%r13;"
+ " adcx %%r11, %%r11;"
+ " adox %%r15, %%r14;"
+ " adcx %%rbx, %%rbx;"
+ " adcx %%r13, %%r13;"
+ " adcx %%r14, %%r14;"
+
+ /* Step 3: Compute intermediate squares */
+ " movq 0(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 0(%1);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 8(%1);"
+ " movq 8(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 16(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 24(%1);"
+ " movq 16(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 32(%1);"
+ " adcx %%rcx, %%rbx;"
+ " movq %%rbx, 40(%1);"
+ " movq 24(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 48(%1);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 56(%1);"
+
+ /* Step 1: Compute all partial products */
+ " movq 32(%0), %%rdx;" /* f[0] */
+ " mulxq 40(%0), %%r8, %%r14;"
+ " xor %%r15d, %%r15d;" /* f[1]*f[0] */
+ " mulxq 48(%0), %%r9, %%r10;"
+ " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 56(%0), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 56(%0), %%rdx;" /* f[3] */
+ " mulxq 40(%0), %%r11, %%rbx;"
+ " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 48(%0), %%rax, %%r13;"
+ " adcx %%rax, %%rbx;" /* f[2]*f[3] */
+ " movq 40(%0), %%rdx;"
+ " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 48(%0), %%rax, %%rcx;"
+ " mov $0, %%r14;" /* f[2]*f[1] */
+
+ /* Step 2: Compute two parallel carry chains */
+ " xor %%r15d, %%r15d;"
+ " adox %%rax, %%r10;"
+ " adcx %%r8, %%r8;"
+ " adox %%rcx, %%r11;"
+ " adcx %%r9, %%r9;"
+ " adox %%r15, %%rbx;"
+ " adcx %%r10, %%r10;"
+ " adox %%r15, %%r13;"
+ " adcx %%r11, %%r11;"
+ " adox %%r15, %%r14;"
+ " adcx %%rbx, %%rbx;"
+ " adcx %%r13, %%r13;"
+ " adcx %%r14, %%r14;"
+
+ /* Step 3: Compute intermediate squares */
+ " movq 32(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 64(%1);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 72(%1);"
+ " movq 40(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 80(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 88(%1);"
+ " movq 48(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 96(%1);"
+ " adcx %%rcx, %%rbx;"
+ " movq %%rbx, 104(%1);"
+ " movq 56(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 112(%1);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 120(%1);"
+
+ /* Line up pointers */
+ " mov %1, %0;"
+ " mov %2, %1;"
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 32(%0), %%r8, %%r13;"
+ " xor %%ecx, %%ecx;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
+ " adcx %%r13, %%r9;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
+ " adcx %%rbx, %%r10;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 24(%0), %%r11;"
+ " adcx %%rcx, %%rax;"
+ " adox %%rcx, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%1);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%1);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%1);"
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 96(%0), %%r8, %%r13;"
+ " xor %%ecx, %%ecx;"
+ " adoxq 64(%0), %%r8;"
+ " mulxq 104(%0), %%r9, %%rbx;"
+ " adcx %%r13, %%r9;"
+ " adoxq 72(%0), %%r9;"
+ " mulxq 112(%0), %%r10, %%r13;"
+ " adcx %%rbx, %%r10;"
+ " adoxq 80(%0), %%r10;"
+ " mulxq 120(%0), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 88(%0), %%r11;"
+ " adcx %%rcx, %%rax;"
+ " adox %%rcx, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 40(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 48(%1);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 56(%1);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 32(%1);"
+ : "+&r"(f), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
+ "%r13", "%r14", "%r15", "memory", "cc");
+}
+
+static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
+{
+ u64 *nq = p01_tmp1;
+ u64 *nq_p1 = p01_tmp1 + (u32)8U;
+ u64 *tmp1 = p01_tmp1 + (u32)16U;
+ u64 *x1 = q;
+ u64 *x2 = nq;
+ u64 *z2 = nq + (u32)4U;
+ u64 *z3 = nq_p1 + (u32)4U;
+ u64 *a = tmp1;
+ u64 *b = tmp1 + (u32)4U;
+ u64 *ab = tmp1;
+ u64 *dc = tmp1 + (u32)8U;
+ u64 *x3;
+ u64 *z31;
+ u64 *d0;
+ u64 *c0;
+ u64 *a1;
+ u64 *b1;
+ u64 *d;
+ u64 *c;
+ u64 *ab1;
+ u64 *dc1;
+ fadd(a, x2, z2);
+ fsub(b, x2, z2);
+ x3 = nq_p1;
+ z31 = nq_p1 + (u32)4U;
+ d0 = dc;
+ c0 = dc + (u32)4U;
+ fadd(c0, x3, z31);
+ fsub(d0, x3, z31);
+ fmul2(dc, dc, ab, tmp2);
+ fadd(x3, d0, c0);
+ fsub(z31, d0, c0);
+ a1 = tmp1;
+ b1 = tmp1 + (u32)4U;
+ d = tmp1 + (u32)8U;
+ c = tmp1 + (u32)12U;
+ ab1 = tmp1;
+ dc1 = tmp1 + (u32)8U;
+ fsqr2(dc1, ab1, tmp2);
+ fsqr2(nq_p1, nq_p1, tmp2);
+ a1[0U] = c[0U];
+ a1[1U] = c[1U];
+ a1[2U] = c[2U];
+ a1[3U] = c[3U];
+ fsub(c, d, c);
+ fmul_scalar(b1, c, (u64)121665U);
+ fadd(b1, b1, d);
+ fmul2(nq, dc1, ab1, tmp2);
+ fmul(z3, z3, x1, tmp2);
+}
+
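+/*
+ * This matches the RFC 7748 xDBL step on the projective point (X2 : Z2),
+ * with a24 = (A - 2) / 4 = 121665 for A = 486662:
+ *
+ *	AA  = (X2 + Z2)^2
+ *	BB  = (X2 - Z2)^2
+ *	E   = AA - BB
+ *	X2' = AA * BB
+ *	Z2' = E * (AA + a24 * E)
+ */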
+static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2)
+{
+ u64 *x2 = nq;
+ u64 *z2 = nq + (u32)4U;
+ u64 *a = tmp1;
+ u64 *b = tmp1 + (u32)4U;
+ u64 *d = tmp1 + (u32)8U;
+ u64 *c = tmp1 + (u32)12U;
+ u64 *ab = tmp1;
+ u64 *dc = tmp1 + (u32)8U;
+ fadd(a, x2, z2);
+ fsub(b, x2, z2);
+ fsqr2(dc, ab, tmp2);
+ a[0U] = c[0U];
+ a[1U] = c[1U];
+ a[2U] = c[2U];
+ a[3U] = c[3U];
+ fsub(c, d, c);
+ fmul_scalar(b, c, (u64)121665U);
+ fadd(b, b, d);
+ fmul2(nq, dc, ab, tmp2);
+}
+
+static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1)
+{
+ u64 tmp2[16U] = { 0U };
+ u64 p01_tmp1_swap[33U] = { 0U };
+ u64 *p0 = p01_tmp1_swap;
+ u64 *p01 = p01_tmp1_swap;
+ u64 *p03 = p01;
+ u64 *p11 = p01 + (u32)8U;
+ u64 *x0;
+ u64 *z0;
+ u64 *p01_tmp1;
+ u64 *p01_tmp11;
+ u64 *nq10;
+ u64 *nq_p11;
+ u64 *swap1;
+ u64 sw0;
+ u64 *nq1;
+ u64 *tmp1;
+ memcpy(p11, init1, (u32)8U * sizeof(init1[0U]));
+ x0 = p03;
+ z0 = p03 + (u32)4U;
+ x0[0U] = (u64)1U;
+ x0[1U] = (u64)0U;
+ x0[2U] = (u64)0U;
+ x0[3U] = (u64)0U;
+ z0[0U] = (u64)0U;
+ z0[1U] = (u64)0U;
+ z0[2U] = (u64)0U;
+ z0[3U] = (u64)0U;
+ p01_tmp1 = p01_tmp1_swap;
+ p01_tmp11 = p01_tmp1_swap;
+ nq10 = p01_tmp1_swap;
+ nq_p11 = p01_tmp1_swap + (u32)8U;
+ swap1 = p01_tmp1_swap + (u32)32U;
+ cswap2((u64)1U, nq10, nq_p11);
+ point_add_and_double(init1, p01_tmp11, tmp2);
+ swap1[0U] = (u64)1U;
+ {
+ u32 i;
+ for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) {
+ u64 *p01_tmp12 = p01_tmp1_swap;
+ u64 *swap2 = p01_tmp1_swap + (u32)32U;
+ u64 *nq2 = p01_tmp12;
+ u64 *nq_p12 = p01_tmp12 + (u32)8U;
+ u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % (u32)8U & (u8)1U);
+ u64 sw = swap2[0U] ^ bit;
+ cswap2(sw, nq2, nq_p12);
+ point_add_and_double(init1, p01_tmp12, tmp2);
+ swap2[0U] = bit;
+ }
+ }
+ sw0 = swap1[0U];
+ cswap2(sw0, nq10, nq_p11);
+ nq1 = p01_tmp1;
+ tmp1 = p01_tmp1 + (u32)16U;
+ point_double(nq1, tmp1, tmp2);
+ point_double(nq1, tmp1, tmp2);
+ point_double(nq1, tmp1, tmp2);
+ memcpy(out, p0, (u32)8U * sizeof(p0[0U]));
+
+ memzero_explicit(tmp2, sizeof(tmp2));
+ memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap));
+}
+
+static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
+{
+ u32 i;
+ fsqr(o, inp, tmp);
+ for (i = (u32)0U; i < n1 - (u32)1U; i = i + (u32)1U)
+ fsqr(o, o, tmp);
+}
+
+static void finv(u64 *o, const u64 *i, u64 *tmp)
+{
+ u64 t1[16U] = { 0U };
+ u64 *a0 = t1;
+ u64 *b = t1 + (u32)4U;
+ u64 *c = t1 + (u32)8U;
+ u64 *t00 = t1 + (u32)12U;
+ u64 *tmp1 = tmp;
+ u64 *a;
+ u64 *t0;
+ fsquare_times(a0, i, tmp1, (u32)1U);
+ fsquare_times(t00, a0, tmp1, (u32)2U);
+ fmul(b, t00, i, tmp);
+ fmul(a0, b, a0, tmp);
+ fsquare_times(t00, a0, tmp1, (u32)1U);
+ fmul(b, t00, b, tmp);
+ fsquare_times(t00, b, tmp1, (u32)5U);
+ fmul(b, t00, b, tmp);
+ fsquare_times(t00, b, tmp1, (u32)10U);
+ fmul(c, t00, b, tmp);
+ fsquare_times(t00, c, tmp1, (u32)20U);
+ fmul(t00, t00, c, tmp);
+ fsquare_times(t00, t00, tmp1, (u32)10U);
+ fmul(b, t00, b, tmp);
+ fsquare_times(t00, b, tmp1, (u32)50U);
+ fmul(c, t00, b, tmp);
+ fsquare_times(t00, c, tmp1, (u32)100U);
+ fmul(t00, t00, c, tmp);
+ fsquare_times(t00, t00, tmp1, (u32)50U);
+ fmul(t00, t00, b, tmp);
+ fsquare_times(t00, t00, tmp1, (u32)5U);
+ a = t1;
+ t0 = t1 + (u32)12U;
+ fmul(o, t0, a, tmp);
+}
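+
+/*
+ * finv() is Fermat inversion: the square-and-multiply chain above
+ * computes i^(2^255 - 21) = i^(p - 2), i.e. 1/i mod p, via the classic
+ * Curve25519 addition chain built from the 2^k - 1 milestones
+ * (k = 5, 10, 20, 50, 100, 250): 254 squarings and 11 multiplications
+ * in total.
+ */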
+
+static void store_felem(u64 *b, u64 *f)
+{
+ u64 f30 = f[3U];
+ u64 top_bit0 = f30 >> (u32)63U;
+ u64 f31;
+ u64 top_bit;
+ u64 f0;
+ u64 f1;
+ u64 f2;
+ u64 f3;
+ u64 m0;
+ u64 m1;
+ u64 m2;
+ u64 m3;
+ u64 mask;
+ u64 f0_;
+ u64 f1_;
+ u64 f2_;
+ u64 f3_;
+ u64 o0;
+ u64 o1;
+ u64 o2;
+ u64 o3;
+ f[3U] = f30 & (u64)0x7fffffffffffffffU;
+ add_scalar(f, f, (u64)19U * top_bit0);
+ f31 = f[3U];
+ top_bit = f31 >> (u32)63U;
+ f[3U] = f31 & (u64)0x7fffffffffffffffU;
+ add_scalar(f, f, (u64)19U * top_bit);
+ f0 = f[0U];
+ f1 = f[1U];
+ f2 = f[2U];
+ f3 = f[3U];
+ m0 = gte_mask(f0, (u64)0xffffffffffffffedU);
+ m1 = eq_mask(f1, (u64)0xffffffffffffffffU);
+ m2 = eq_mask(f2, (u64)0xffffffffffffffffU);
+ m3 = eq_mask(f3, (u64)0x7fffffffffffffffU);
+ mask = ((m0 & m1) & m2) & m3;
+ f0_ = f0 - (mask & (u64)0xffffffffffffffedU);
+ f1_ = f1 - (mask & (u64)0xffffffffffffffffU);
+ f2_ = f2 - (mask & (u64)0xffffffffffffffffU);
+ f3_ = f3 - (mask & (u64)0x7fffffffffffffffU);
+ o0 = f0_;
+ o1 = f1_;
+ o2 = f2_;
+ o3 = f3_;
+ b[0U] = o0;
+ b[1U] = o1;
+ b[2U] = o2;
+ b[3U] = o3;
+}
+
+static void encode_point(u8 *o, const u64 *i)
+{
+ const u64 *x = i;
+ const u64 *z = i + (u32)4U;
+ u64 tmp[4U] = { 0U };
+ u64 tmp_w[16U] = { 0U };
+ finv(tmp, z, tmp_w);
+ fmul(tmp, tmp, x, tmp_w);
+ store_felem((u64 *)o, tmp);
+}
+
+static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub)
+{
+ u64 init1[8U] = { 0U };
+ u64 tmp[4U] = { 0U };
+ u64 tmp3;
+ u64 *x;
+ u64 *z;
+ {
+ u32 i;
+ for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) {
+ u64 *os = tmp;
+ const u8 *bj = pub + i * (u32)8U;
+ u64 u = *(u64 *)bj;
+ u64 r = u;
+ u64 x0 = r;
+ os[i] = x0;
+ }
+ }
+ tmp3 = tmp[3U];
+ tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU;
+ x = init1;
+ z = init1 + (u32)4U;
+ z[0U] = (u64)1U;
+ z[1U] = (u64)0U;
+ z[2U] = (u64)0U;
+ z[3U] = (u64)0U;
+ x[0U] = tmp[0U];
+ x[1U] = tmp[1U];
+ x[2U] = tmp[2U];
+ x[3U] = tmp[3U];
+ montgomery_ladder(init1, priv, init1);
+ encode_point(out, init1);
+}
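+
+/*
+ * The raw u64 loads in the decode loop above rely on x86 being
+ * little-endian and tolerant of unaligned loads; a portable sketch of
+ * the same decode would be
+ *
+ *	u64 get_le64(const u8 *p)
+ *	{
+ *		u64 v = 0;
+ *		for (int i = 7; i >= 0; i--)
+ *			v = (v << 8) | p[i];
+ *		return v;
+ *	}
+ *
+ * and the tmp[3] masking clears bit 255 of the u-coordinate, as RFC 7748
+ * prescribes for X25519 input decoding.
+ */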
+
+/* The below constants were generated using this sage script:
+ *
+ * #!/usr/bin/env sage
+ * import sys
+ * from sage.all import *
+ * def limbs(n):
+ * n = int(n)
+ * l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64)
+ * return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l
+ * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0])
+ * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0]
+ * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s))
+ * print("static const u64 table_ladder[] = {")
+ * p = ec.lift_x(9)
+ * for i in range(252):
+ * l = (p[0] + p[2]) / (p[0] - p[2])
+ * print(("\t%s" + ("," if i != 251 else "")) % limbs(l))
+ * p = p * 2
+ * print("};")
+ *
+ */
+
+static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL };
+
+static const u64 table_ladder[] = {
+ 0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL,
+ 0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL,
+ 0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL,
+ 0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL,
+ 0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL,
+ 0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL,
+ 0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL,
+ 0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL,
+ 0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL,
+ 0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL,
+ 0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL,
+ 0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL,
+ 0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL,
+ 0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL,
+ 0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL,
+ 0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL,
+ 0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL,
+ 0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL,
+ 0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL,
+ 0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL,
+ 0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL,
+ 0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL,
+ 0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL,
+ 0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL,
+ 0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL,
+ 0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL,
+ 0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL,
+ 0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL,
+ 0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL,
+ 0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL,
+ 0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL,
+ 0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL,
+ 0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL,
+ 0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL,
+ 0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL,
+ 0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL,
+ 0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL,
+ 0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL,
+ 0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL,
+ 0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL,
+ 0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL,
+ 0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL,
+ 0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL,
+ 0x7a8efb58896928f4ULL, 0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL,
+ 0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL,
+ 0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL,
+ 0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL,
+ 0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL,
+ 0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL,
+ 0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL,
+ 0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL,
+ 0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL,
+ 0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL,
+ 0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL,
+ 0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL,
+ 0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL,
+ 0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL,
+ 0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL,
+ 0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL,
+ 0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL,
+ 0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL,
+ 0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL,
+ 0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL,
+ 0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL,
+ 0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL,
+ 0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL,
+ 0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL,
+ 0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL,
+ 0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL,
+ 0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL,
+ 0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL,
+ 0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL,
+ 0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL,
+ 0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL,
+ 0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL,
+ 0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL,
+ 0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL,
+ 0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL,
+ 0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL,
+ 0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL,
+ 0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL,
+ 0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL,
+ 0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL,
+ 0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL,
+ 0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL,
+ 0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL,
+ 0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL,
+ 0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL,
+ 0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL,
+ 0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL,
+ 0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL,
+ 0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL,
+ 0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL,
+ 0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL,
+ 0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL,
+ 0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL,
+ 0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL,
+ 0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL,
+ 0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL,
+ 0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL,
+ 0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL,
+ 0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL,
+ 0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL,
+ 0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL,
+ 0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL,
+ 0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL,
+ 0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL,
+ 0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL,
+ 0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL,
+ 0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL,
+ 0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL,
+ 0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL,
+ 0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL,
+ 0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL,
+ 0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL,
+ 0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL,
+ 0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL,
+ 0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL,
+ 0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 0x25e496c722422236ULL,
+ 0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL,
+ 0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL,
+ 0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL,
+ 0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL,
+ 0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL,
+ 0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL,
+ 0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL,
+ 0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL,
+ 0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL,
+ 0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL,
+ 0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL,
+ 0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL,
+ 0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL,
+ 0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL,
+ 0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL,
+ 0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL,
+ 0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL,
+ 0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL,
+ 0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL,
+ 0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL,
+ 0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL,
+ 0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL,
+ 0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL,
+ 0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL,
+ 0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL,
+ 0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL,
+ 0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL,
+ 0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL,
+ 0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL,
+ 0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL,
+ 0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL,
+ 0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL,
+ 0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL,
+ 0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL,
+ 0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL,
+ 0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL,
+ 0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL,
+ 0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL,
+ 0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL,
+ 0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL,
+ 0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL,
+ 0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL,
+ 0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL,
+ 0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL,
+ 0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL,
+ 0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL,
+ 0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL,
+ 0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL,
+ 0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL,
+ 0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL,
+ 0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL,
+ 0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL,
+ 0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL,
+ 0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL,
+ 0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL,
+ 0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL,
+ 0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL,
+ 0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL,
+ 0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL,
+ 0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL,
+ 0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL,
+ 0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL,
+ 0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL,
+ 0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL,
+ 0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL,
+ 0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL,
+ 0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL,
+ 0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL,
+ 0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL,
+ 0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL,
+ 0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL,
+ 0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL,
+ 0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL,
+ 0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL,
+ 0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL,
+ 0x09b9515a1e7b4d7cULL, 0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL,
+ 0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL,
+ 0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL,
+ 0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL,
+ 0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL,
+ 0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL,
+ 0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL,
+ 0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL,
+ 0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL,
+ 0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL,
+ 0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL,
+ 0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL,
+ 0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL,
+ 0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL,
+ 0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL,
+ 0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL,
+ 0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL,
+ 0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL,
+ 0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL,
+ 0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL,
+ 0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL,
+ 0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL,
+ 0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL,
+ 0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL,
+ 0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL,
+ 0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL,
+ 0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL,
+ 0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL,
+ 0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL,
+ 0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL,
+ 0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL,
+ 0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL,
+ 0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL,
+ 0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL,
+ 0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL,
+ 0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL,
+ 0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL,
+ 0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL,
+ 0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL,
+ 0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL,
+ 0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 0x645b426f3d1d58acULL, 0x4804a82227a557bcULL,
+ 0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL,
+ 0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL,
+ 0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL,
+ 0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL,
+ 0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL,
+ 0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL,
+ 0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL,
+ 0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL,
+ 0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL,
+ 0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL,
+ 0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL,
+ 0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL,
+ 0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL,
+ 0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL,
+ 0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL,
+ 0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL,
+ 0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL
+};
+
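+/*
+ * Fixed-base scalar multiplication using the table above. The clamped
+ * private key is a multiple of 8 with bit 254 forced on, so it can be
+ * written as 8 * sum(bit[k + 3] * 2^k) over k = 0..251. The ladder
+ * below scans bits 3..254 of the key, combining in the precomputed
+ * (x + z)/(x - z) value of 2^k * B from table_ladder[4*k] (see the
+ * sage script above), and the three trailing point_double() calls
+ * restore the factor of 8.
+ */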
+static void curve25519_ever64_base(u8 *out, const u8 *priv)
+{
+ u64 swap = 1;
+ int i, j, k;
+ u64 tmp[16 + 32 + 4];
+ u64 *x1 = &tmp[0];
+ u64 *z1 = &tmp[4];
+ u64 *x2 = &tmp[8];
+ u64 *z2 = &tmp[12];
+ u64 *xz1 = &tmp[0];
+ u64 *xz2 = &tmp[8];
+ u64 *a = &tmp[0 + 16];
+ u64 *b = &tmp[4 + 16];
+ u64 *c = &tmp[8 + 16];
+ u64 *ab = &tmp[0 + 16];
+ u64 *abcd = &tmp[0 + 16];
+ u64 *ef = &tmp[16 + 16];
+ u64 *efgh = &tmp[16 + 16];
+ u64 *key = &tmp[0 + 16 + 32];
+
+ memcpy(key, priv, 32);
+ ((u8 *)key)[0] &= 248;
+ ((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64;
+
+ x1[0] = 1, x1[1] = x1[2] = x1[3] = 0;
+ z1[0] = 1, z1[1] = z1[2] = z1[3] = 0;
+ z2[0] = 1, z2[1] = z2[2] = z2[3] = 0;
+ memcpy(x2, p_minus_s, sizeof(p_minus_s));
+
+ j = 3;
+ for (i = 0; i < 4; ++i) {
+ while (j < (const int[]){ 64, 64, 64, 63 }[i]) {
+ u64 bit = (key[i] >> j) & 1;
+ k = (64 * i + j - 3);
+ swap = swap ^ bit;
+ cswap2(swap, xz1, xz2);
+ swap = bit;
+ fsub(b, x1, z1);
+ fadd(a, x1, z1);
+ fmul(c, &table_ladder[4 * k], b, ef);
+ fsub(b, a, c);
+ fadd(a, a, c);
+ fsqr2(ab, ab, efgh);
+ fmul2(xz1, xz2, ab, efgh);
+ ++j;
+ }
+ j = 0;
+ }
+
+ point_double(xz1, abcd, efgh);
+ point_double(xz1, abcd, efgh);
+ point_double(xz1, abcd, efgh);
+ encode_point(out, xz1);
+
+ memzero_explicit(tmp, sizeof(tmp));
+}
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx);
+
+static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE])
+{
+ if (static_branch_likely(&curve25519_use_bmi2_adx))
+ curve25519_ever64(mypublic, secret, basepoint);
+ else
+ curve25519_generic(mypublic, secret, basepoint);
+}
+
+static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE])
+{
+ if (static_branch_likely(&curve25519_use_bmi2_adx))
+ curve25519_ever64_base(pub, secret);
+ else
+ curve25519_generic(pub, secret, curve25519_base_point);
+}
+
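+/*
+ * The ever64 field arithmetic relies on MULX (BMI2) and ADCX/ADOX (ADX)
+ * to keep two independent carry chains in flight during the 4x4-limb
+ * multiplications, hence the combined feature check below before the
+ * static branch is flipped.
+ */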
+#define curve25519_mod_init_arch curve25519_mod_init_arch
+static void curve25519_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
+ static_branch_enable(&curve25519_use_bmi2_adx);
+}
diff --git a/lib/crypto/x86/poly1305-x86_64-cryptogams.pl b/lib/crypto/x86/poly1305-x86_64-cryptogams.pl
new file mode 100644
index 000000000000..409ec6955733
--- /dev/null
+++ b/lib/crypto/x86/poly1305-x86_64-cryptogams.pl
@@ -0,0 +1,4240 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+#
+# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+#
+# This code is taken from the OpenSSL project but the author, Andy Polyakov,
+# has relicensed it under the licenses specified in the SPDX header above.
+# The original headers, including the original license headers, are
+# included below for completeness.
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements the Poly1305 hash for x86_64.
+#
+# March 2015
+#
+# Initial release.
+#
+# December 2016
+#
+# Add AVX512F+VL+BW code path.
+#
+# November 2017
+#
+# Convert the AVX512F+VL+BW code path to pure AVX512F, so that it can
+# be executed even on Knights Landing. The trigger for the modification
+# was the observation that AVX512 code paths can negatively affect
+# overall Skylake-X system performance. Since we are likely to suppress
+# the AVX512F capability flag [at least on Skylake-X], the conversion
+# serves as a kind of "investment protection". Note that the next *lake
+# processor, Cannonlake, has an AVX512IFMA code path to execute...
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone,
+# measured with rdtsc at fixed clock frequency.
+#
+#               IALU/gcc-4.8(*) AVX(**)  AVX2    AVX-512
+# P4            4.46/+120%      -
+# Core 2        2.41/+90%       -
+# Westmere      1.88/+120%      -
+# Sandy Bridge  1.39/+140%      1.10
+# Haswell       1.14/+175%      1.11     0.65
+# Skylake[-X]   1.13/+120%      0.96     0.51    [0.35]
+# Silvermont    2.83/+95%       -
+# Knights L     3.60/?          1.65     1.10    0.41(***)
+# Goldmont      1.70/+180%      -
+# VIA Nano      1.82/+150%      -
+# Sledgehammer  1.38/+160%      -
+# Bulldozer     2.30/+130%      0.97
+# Ryzen         1.15/+200%      1.08     1.18
+#
+# (*)   improvement coefficients relative to clang are more modest and
+#       are ~50% on most processors; in both cases we are comparing to
+#       __int128 code;
+# (**)  an SSE2 implementation was attempted, but among non-AVX processors
+#       it beat the integer-only code only on older Intel P4 and Core
+#       processors, by 30-50%, with the gain shrinking on newer parts;
+#       on contemporary ones it was slower, e.g. almost 2x slower on
+#       Atom, and as the former are naturally disappearing, SSE2 was
+#       deemed unnecessary;
+# (***) strangely enough performance seems to vary from core to core;
+#       the listed result is the best case;
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+$kernel=0; $kernel=1 if (!$flavour && !$output);
+
+if (!$kernel) {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+ die "can't locate x86_64-xlate.pl";
+
+ open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+ *STDOUT=*OUT;
+
+ if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
+ }
+
+ if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
+ $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
+ $avx += 1 if ($1==2.11 && $2>=8);
+ }
+
+ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+ }
+
+ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+ }
+} else {
+ $avx = 4; # The kernel uses ifdefs for this.
+}
+
+sub declare_function() {
+ my ($name, $align, $nargs) = @_;
+ if($kernel) {
+ $code .= "SYM_FUNC_START($name)\n";
+ $code .= ".L$name:\n";
+ } else {
+ $code .= ".globl $name\n";
+ $code .= ".type $name,\@function,$nargs\n";
+ $code .= ".align $align\n";
+ $code .= "$name:\n";
+ }
+}
+
+sub end_function() {
+ my ($name) = @_;
+ if($kernel) {
+ $code .= "SYM_FUNC_END($name)\n";
+ } else {
+ $code .= ".size $name,.-$name\n";
+ }
+}
+
+$code.=<<___ if $kernel;
+#include <linux/linkage.h>
+___
+
+if ($avx) {
+$code.=<<___ if $kernel;
+.section .rodata
+___
+$code.=<<___;
+.align 64
+.Lconst:
+.Lmask24:
+.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
+.L129:
+.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
+.Lmask26:
+.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
+.Lpermd_avx2:
+.long 2,2,2,3,2,0,2,1
+.Lpermd_avx512:
+.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
+
+.L2_44_inp_permd:
+.long 0,1,1,2,2,3,7,7
+.L2_44_inp_shift:
+.quad 0,12,24,64
+.L2_44_mask:
+.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
+.L2_44_shift_rgt:
+.quad 44,44,42,64
+.L2_44_shift_lft:
+.quad 8,8,10,64
+
+.align 64
+.Lx_mask44:
+.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.Lx_mask42:
+.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+___
+}
+$code.=<<___ if (!$kernel);
+.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 16
+___
+
+my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
+my ($mac,$nonce)=($inp,$len); # *_emit arguments
+my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
+my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
+
+sub poly1305_iteration {
+# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
+# output: $h0-$h2 *= $r0-$r1
+$code.=<<___;
+ mulq $h0 # h0*r1
+ mov %rax,$d2
+ mov $r0,%rax
+ mov %rdx,$d3
+
+ mulq $h0 # h0*r0
+ mov %rax,$h0 # future $h0
+ mov $r0,%rax
+ mov %rdx,$d1
+
+ mulq $h1 # h1*r0
+ add %rax,$d2
+ mov $s1,%rax
+ adc %rdx,$d3
+
+ mulq $h1 # h1*s1
+ mov $h2,$h1 # borrow $h1
+ add %rax,$h0
+ adc %rdx,$d1
+
+ imulq $s1,$h1 # h2*s1
+ add $h1,$d2
+ mov $d1,$h1
+ adc \$0,$d3
+
+ imulq $r0,$h2 # h2*r0
+ add $d2,$h1
+ mov \$-4,%rax # mask value
+ adc $h2,$d3
+
+ and $d3,%rax # last reduction step
+ mov $d3,$h2
+ shr \$2,$d3
+ and \$3,$h2
+ add $d3,%rax
+ add %rax,$h0
+ adc \$0,$h1
+ adc \$0,$h2
+___
+}
+
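+# For reference, the iteration above computes h = (h * r) mod 2^130 - 5
+# with h kept in three 64-bit limbs. A C sketch of the same arithmetic
+# (an illustrative model only; it assumes r was clamped at init time, so
+# r1's low two bits are zero and s1 = r1 + (r1 >> 2) equals 5*r1/4):
+#
+#	typedef unsigned __int128 u128;
+#
+#	u128 d0 = (u128)h0*r0 + (u128)h1*s1;		    /* 2^0   terms */
+#	u128 d1 = (u128)h0*r1 + (u128)h1*r0 + (u128)h2*s1; /* 2^64  terms */
+#	u64  d2 = h2*r0;				    /* 2^128 terms */
+#
+#	h0 = (u64)d0;	d1 += (u64)(d0 >> 64);
+#	h1 = (u64)d1;	d2 += (u64)(d1 >> 64);
+#	/* fold bits 130 and up back in, using 2^130 == 5 (mod p) */
+#	u128 t = (u128)h0 + (u128)(d2 >> 2)*5;
+#	h0 = (u64)t;	t = (t >> 64) + h1;
+#	h1 = (u64)t;	h2 = (d2 & 3) + (u64)(t >> 64);
+#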
+########################################################################
+# The layout of the opaque area is as follows.
+#
+# unsigned __int64 h[3]; # current hash value base 2^64
+# unsigned __int64 r[2]; # key value base 2^64
+
+$code.=<<___;
+.text
+___
+$code.=<<___ if (!$kernel);
+.extern OPENSSL_ia32cap_P
+
+.globl poly1305_init_x86_64
+.hidden poly1305_init_x86_64
+.globl poly1305_blocks_x86_64
+.hidden poly1305_blocks_x86_64
+.globl poly1305_emit_x86_64
+.hidden poly1305_emit_x86_64
+___
+&declare_function("poly1305_init_x86_64", 32, 3);
+$code.=<<___;
+ xor %eax,%eax
+ mov %rax,0($ctx) # initialize hash value
+ mov %rax,8($ctx)
+ mov %rax,16($ctx)
+
+ test $inp,$inp
+ je .Lno_key
+___
+$code.=<<___ if (!$kernel);
+ lea poly1305_blocks_x86_64(%rip),%r10
+ lea poly1305_emit_x86_64(%rip),%r11
+___
+$code.=<<___ if (!$kernel && $avx);
+ mov OPENSSL_ia32cap_P+4(%rip),%r9
+ lea poly1305_blocks_avx(%rip),%rax
+ lea poly1305_emit_avx(%rip),%rcx
+ bt \$`60-32`,%r9 # AVX?
+ cmovc %rax,%r10
+ cmovc %rcx,%r11
+___
+$code.=<<___ if (!$kernel && $avx>1);
+ lea poly1305_blocks_avx2(%rip),%rax
+ bt \$`5+32`,%r9 # AVX2?
+ cmovc %rax,%r10
+___
+$code.=<<___ if (!$kernel && $avx>3);
+ mov \$`(1<<31|1<<21|1<<16)`,%rax
+ shr \$32,%r9
+ and %rax,%r9
+ cmp %rax,%r9
+ je .Linit_base2_44
+___
+$code.=<<___;
+ mov \$0x0ffffffc0fffffff,%rax
+ mov \$0x0ffffffc0ffffffc,%rcx
+ and 0($inp),%rax
+ and 8($inp),%rcx
+ mov %rax,24($ctx)
+ mov %rcx,32($ctx)
+___
+$code.=<<___ if (!$kernel && $flavour !~ /elf32/);
+ mov %r10,0(%rdx)
+ mov %r11,8(%rdx)
+___
+$code.=<<___ if (!$kernel && $flavour =~ /elf32/);
+ mov %r10d,0(%rdx)
+ mov %r11d,4(%rdx)
+___
+$code.=<<___;
+ mov \$1,%eax
+.Lno_key:
+ RET
+___
+&end_function("poly1305_init_x86_64");
+
+&declare_function("poly1305_blocks_x86_64", 32, 4);
+$code.=<<___;
+.cfi_startproc
+.Lblocks:
+ shr \$4,$len
+ jz .Lno_data # too short
+
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+ push $ctx
+.cfi_push $ctx
+.Lblocks_body:
+
+ mov $len,%r15 # reassign $len
+
+ mov 24($ctx),$r0 # load r
+ mov 32($ctx),$s1
+
+ mov 0($ctx),$h0 # load hash value
+ mov 8($ctx),$h1
+ mov 16($ctx),$h2
+
+ mov $s1,$r1
+ shr \$2,$s1
+ mov $r1,%rax
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
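+	# (init clamped r1's low two bits to zero, so s1 is exactly 5*r1/4;
+	# it folds the 2^130 == 5 (mod p) reduction into the multiplication)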
+ jmp .Loop
+
+.align 32
+.Loop:
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+ adc $padbit,$h2
+___
+
+ &poly1305_iteration();
+
+$code.=<<___;
+ mov $r1,%rax
+ dec %r15 # len-=16
+ jnz .Loop
+
+ mov 0(%rsp),$ctx
+.cfi_restore $ctx
+
+ mov $h0,0($ctx) # store hash value
+ mov $h1,8($ctx)
+ mov $h2,16($ctx)
+
+ mov 8(%rsp),%r15
+.cfi_restore %r15
+ mov 16(%rsp),%r14
+.cfi_restore %r14
+ mov 24(%rsp),%r13
+.cfi_restore %r13
+ mov 32(%rsp),%r12
+.cfi_restore %r12
+ mov 40(%rsp),%rbx
+.cfi_restore %rbx
+ lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lno_data:
+.Lblocks_epilogue:
+ RET
+.cfi_endproc
+___
+&end_function("poly1305_blocks_x86_64");
+
+&declare_function("poly1305_emit_x86_64", 32, 3);
+$code.=<<___;
+.Lemit:
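+	# final reduction: compute h mod p, p = 2^130 - 5. If h + 5 carries
+	# into bit 130 then h >= p, and h mod p == (h + 5) mod 2^128; so add
+	# 5 speculatively and keep the sum only when bit 130 comes out set:
+	#	g = h + 5; if (g >> 130) h = g;
+	#	mac = (h + nonce) mod 2^128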
+ mov 0($ctx),%r8 # load hash value
+ mov 8($ctx),%r9
+ mov 16($ctx),%r10
+
+ mov %r8,%rax
+ add \$5,%r8 # compare to modulus
+ mov %r9,%rcx
+ adc \$0,%r9
+ adc \$0,%r10
+ shr \$2,%r10 # did 130-bit value overflow?
+ cmovnz %r8,%rax
+ cmovnz %r9,%rcx
+
+ add 0($nonce),%rax # accumulate nonce
+ adc 8($nonce),%rcx
+ mov %rax,0($mac) # write result
+ mov %rcx,8($mac)
+
+ RET
+___
+&end_function("poly1305_emit_x86_64");
+if ($avx) {
+
+########################################################################
+# The layout of the opaque area is as follows.
+#
+# unsigned __int32 h[5]; # current hash value base 2^26
+# unsigned __int32 is_base2_26;
+# unsigned __int64 r[2]; # key value base 2^64
+# unsigned __int64 pad;
+# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
+#
+# where the r^n values are the base 2^26 digits of the powers of the
+# multiplier key. There are 5 digits, but the last four are interleaved
+# with their multiples of 5, giving 9 elements in total: r0, r1, 5*r1,
+# r2, 5*r2, r3, 5*r3, r4, 5*r4.
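+#
+# In C terms the area could be modelled roughly as (field names are
+# illustrative, not from the source):
+#
+#	struct poly1305_opaque {
+#		u32 h[5];			/* hash value, base 2^26 */
+#		u32 is_base2_26;
+#		u64 r[2];			/* key value, base 2^64 */
+#		u64 pad;
+#		struct { u32 r2, r1, r4, r3; } rn[9];
+#	};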
+
+my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
+ map("%xmm$_",(0..15));
+
+$code.=<<___;
+.type __poly1305_block,\@abi-omnipotent
+.align 32
+__poly1305_block:
+ push $ctx
+___
+ &poly1305_iteration();
+$code.=<<___;
+ pop $ctx
+ RET
+.size __poly1305_block,.-__poly1305_block
+
+.type __poly1305_init_avx,\@abi-omnipotent
+.align 32
+__poly1305_init_avx:
+ push %rbp
+ mov %rsp,%rbp
+ mov $r0,$h0
+ mov $r1,$h1
+ xor $h2,$h2
+
+ lea 48+64($ctx),$ctx # size optimization
+
+ mov $r1,%rax
+ call __poly1305_block # r^2
+
+ mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
+ mov \$0x3ffffff,%edx
+ mov $h0,$d1
+ and $h0#d,%eax
+ mov $r0,$d2
+ and $r0#d,%edx
+ mov %eax,`16*0+0-64`($ctx)
+ shr \$26,$d1
+ mov %edx,`16*0+4-64`($ctx)
+ shr \$26,$d2
+
+ mov \$0x3ffffff,%eax
+ mov \$0x3ffffff,%edx
+ and $d1#d,%eax
+ and $d2#d,%edx
+ mov %eax,`16*1+0-64`($ctx)
+ lea (%rax,%rax,4),%eax # *5
+ mov %edx,`16*1+4-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ mov %eax,`16*2+0-64`($ctx)
+ shr \$26,$d1
+ mov %edx,`16*2+4-64`($ctx)
+ shr \$26,$d2
+
+ mov $h1,%rax
+ mov $r1,%rdx
+ shl \$12,%rax
+ shl \$12,%rdx
+ or $d1,%rax
+ or $d2,%rdx
+ and \$0x3ffffff,%eax
+ and \$0x3ffffff,%edx
+ mov %eax,`16*3+0-64`($ctx)
+ lea (%rax,%rax,4),%eax # *5
+ mov %edx,`16*3+4-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ mov %eax,`16*4+0-64`($ctx)
+ mov $h1,$d1
+ mov %edx,`16*4+4-64`($ctx)
+ mov $r1,$d2
+
+ mov \$0x3ffffff,%eax
+ mov \$0x3ffffff,%edx
+ shr \$14,$d1
+ shr \$14,$d2
+ and $d1#d,%eax
+ and $d2#d,%edx
+ mov %eax,`16*5+0-64`($ctx)
+ lea (%rax,%rax,4),%eax # *5
+ mov %edx,`16*5+4-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ mov %eax,`16*6+0-64`($ctx)
+ shr \$26,$d1
+ mov %edx,`16*6+4-64`($ctx)
+ shr \$26,$d2
+
+ mov $h2,%rax
+ shl \$24,%rax
+ or %rax,$d1
+ mov $d1#d,`16*7+0-64`($ctx)
+ lea ($d1,$d1,4),$d1 # *5
+ mov $d2#d,`16*7+4-64`($ctx)
+ lea ($d2,$d2,4),$d2 # *5
+ mov $d1#d,`16*8+0-64`($ctx)
+ mov $d2#d,`16*8+4-64`($ctx)
+
+ mov $r1,%rax
+ call __poly1305_block # r^3
+
+ mov \$0x3ffffff,%eax # save r^3 base 2^26
+ mov $h0,$d1
+ and $h0#d,%eax
+ shr \$26,$d1
+ mov %eax,`16*0+12-64`($ctx)
+
+ mov \$0x3ffffff,%edx
+ and $d1#d,%edx
+ mov %edx,`16*1+12-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ shr \$26,$d1
+ mov %edx,`16*2+12-64`($ctx)
+
+ mov $h1,%rax
+ shl \$12,%rax
+ or $d1,%rax
+ and \$0x3ffffff,%eax
+ mov %eax,`16*3+12-64`($ctx)
+ lea (%rax,%rax,4),%eax # *5
+ mov $h1,$d1
+ mov %eax,`16*4+12-64`($ctx)
+
+ mov \$0x3ffffff,%edx
+ shr \$14,$d1
+ and $d1#d,%edx
+ mov %edx,`16*5+12-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ shr \$26,$d1
+ mov %edx,`16*6+12-64`($ctx)
+
+ mov $h2,%rax
+ shl \$24,%rax
+ or %rax,$d1
+ mov $d1#d,`16*7+12-64`($ctx)
+ lea ($d1,$d1,4),$d1 # *5
+ mov $d1#d,`16*8+12-64`($ctx)
+
+ mov $r1,%rax
+ call __poly1305_block # r^4
+
+ mov \$0x3ffffff,%eax # save r^4 base 2^26
+ mov $h0,$d1
+ and $h0#d,%eax
+ shr \$26,$d1
+ mov %eax,`16*0+8-64`($ctx)
+
+ mov \$0x3ffffff,%edx
+ and $d1#d,%edx
+ mov %edx,`16*1+8-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ shr \$26,$d1
+ mov %edx,`16*2+8-64`($ctx)
+
+ mov $h1,%rax
+ shl \$12,%rax
+ or $d1,%rax
+ and \$0x3ffffff,%eax
+ mov %eax,`16*3+8-64`($ctx)
+ lea (%rax,%rax,4),%eax # *5
+ mov $h1,$d1
+ mov %eax,`16*4+8-64`($ctx)
+
+ mov \$0x3ffffff,%edx
+ shr \$14,$d1
+ and $d1#d,%edx
+ mov %edx,`16*5+8-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ shr \$26,$d1
+ mov %edx,`16*6+8-64`($ctx)
+
+ mov $h2,%rax
+ shl \$24,%rax
+ or %rax,$d1
+ mov $d1#d,`16*7+8-64`($ctx)
+ lea ($d1,$d1,4),$d1 # *5
+ mov $d1#d,`16*8+8-64`($ctx)
+
+ lea -48-64($ctx),$ctx # size [de-]optimization
+ pop %rbp
+ RET
+.size __poly1305_init_avx,.-__poly1305_init_avx
+___
+
+&declare_function("poly1305_blocks_avx", 32, 4);
+$code.=<<___;
+.cfi_startproc
+ mov 20($ctx),%r8d # is_base2_26
+ cmp \$128,$len
+ jae .Lblocks_avx
+ test %r8d,%r8d
+ jz .Lblocks
+
+.Lblocks_avx:
+ and \$-16,$len
+ jz .Lno_data_avx
+
+ vzeroupper
+
+ test %r8d,%r8d
+ jz .Lbase2_64_avx
+
+ test \$31,$len
+ jz .Leven_avx
+
+ push %rbp
+.cfi_push %rbp
+ mov %rsp,%rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lblocks_avx_body:
+
+ mov $len,%r15 # reassign $len
+
+ mov 0($ctx),$d1 # load hash value
+ mov 8($ctx),$d2
+ mov 16($ctx),$h2#d
+
+ mov 24($ctx),$r0 # load r
+ mov 32($ctx),$s1
+
+ ################################# base 2^26 -> base 2^64
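+	# the five 26-bit digits h[0..4] (at bit offsets 0/26/52/78/104)
+	# are repacked into three 64-bit limbs; as a sketch:
+	#	h0 = h[0] | h[1]<<26 | h[2]<<52
+	#	h1 = h[2]>>12 | h[3]<<14 | h[4]<<40
+	#	h2 = h[4]>>24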
+ mov $d1#d,$h0#d
+ and \$`-1*(1<<31)`,$d1
+ mov $d2,$r1 # borrow $r1
+ mov $d2#d,$h1#d
+ and \$`-1*(1<<31)`,$d2
+
+ shr \$6,$d1
+ shl \$52,$r1
+ add $d1,$h0
+ shr \$12,$h1
+ shr \$18,$d2
+ add $r1,$h0
+ adc $d2,$h1
+
+ mov $h2,$d1
+ shl \$40,$d1
+ shr \$24,$h2
+ add $d1,$h1
+ adc \$0,$h2 # can be partially reduced...
+
+ mov \$-4,$d2 # ... so reduce
+ mov $h2,$d1
+ and $h2,$d2
+ shr \$2,$d1
+ and \$3,$h2
+ add $d2,$d1 # =*5
+ add $d1,$h0
+ adc \$0,$h1
+ adc \$0,$h2
+
+ mov $s1,$r1
+ mov $s1,%rax
+ shr \$2,$s1
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+ adc $padbit,$h2
+
+ call __poly1305_block
+
+ test $padbit,$padbit # if $padbit is zero,
+ jz .Lstore_base2_64_avx # store hash in base 2^64 format
+
+ ################################# base 2^64 -> base 2^26
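+	# split the three 64-bit limbs back into five 26-bit digits;
+	# as a sketch:
+	#	h[0] = h0 & 0x3ffffff
+	#	h[1] = (h0 >> 26) & 0x3ffffff
+	#	h[2] = (h0 >> 52 | h1 << 12) & 0x3ffffff
+	#	h[3] = (h1 >> 14) & 0x3ffffff
+	#	h[4] = h1 >> 40 | h2 << 24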
+ mov $h0,%rax
+ mov $h0,%rdx
+ shr \$52,$h0
+ mov $h1,$r0
+ mov $h1,$r1
+ shr \$26,%rdx
+ and \$0x3ffffff,%rax # h[0]
+ shl \$12,$r0
+ and \$0x3ffffff,%rdx # h[1]
+ shr \$14,$h1
+ or $r0,$h0
+ shl \$24,$h2
+ and \$0x3ffffff,$h0 # h[2]
+ shr \$40,$r1
+ and \$0x3ffffff,$h1 # h[3]
+ or $r1,$h2 # h[4]
+
+ sub \$16,%r15
+ jz .Lstore_base2_26_avx
+
+ vmovd %rax#d,$H0
+ vmovd %rdx#d,$H1
+ vmovd $h0#d,$H2
+ vmovd $h1#d,$H3
+ vmovd $h2#d,$H4
+ jmp .Lproceed_avx
+
+.align 32
+.Lstore_base2_64_avx:
+ mov $h0,0($ctx)
+ mov $h1,8($ctx)
+ mov $h2,16($ctx) # note that is_base2_26 is zeroed
+ jmp .Ldone_avx
+
+.align 16
+.Lstore_base2_26_avx:
+ mov %rax#d,0($ctx) # store hash value base 2^26
+ mov %rdx#d,4($ctx)
+ mov $h0#d,8($ctx)
+ mov $h1#d,12($ctx)
+ mov $h2#d,16($ctx)
+.align 16
+.Ldone_avx:
+ pop %r15
+.cfi_restore %r15
+ pop %r14
+.cfi_restore %r14
+ pop %r13
+.cfi_restore %r13
+ pop %r12
+.cfi_restore %r12
+ pop %rbx
+.cfi_restore %rbx
+ pop %rbp
+.cfi_restore %rbp
+.Lno_data_avx:
+.Lblocks_avx_epilogue:
+ RET
+.cfi_endproc
+
+.align 32
+.Lbase2_64_avx:
+.cfi_startproc
+ push %rbp
+.cfi_push %rbp
+ mov %rsp,%rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lbase2_64_avx_body:
+
+ mov $len,%r15 # reassign $len
+
+ mov 24($ctx),$r0 # load r
+ mov 32($ctx),$s1
+
+ mov 0($ctx),$h0 # load hash value
+ mov 8($ctx),$h1
+ mov 16($ctx),$h2#d
+
+ mov $s1,$r1
+ mov $s1,%rax
+ shr \$2,$s1
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+
+ test \$31,$len
+ jz .Linit_avx
+
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+ adc $padbit,$h2
+ sub \$16,%r15
+
+ call __poly1305_block
+
+.Linit_avx:
+ ################################# base 2^64 -> base 2^26
+ mov $h0,%rax
+ mov $h0,%rdx
+ shr \$52,$h0
+ mov $h1,$d1
+ mov $h1,$d2
+ shr \$26,%rdx
+ and \$0x3ffffff,%rax # h[0]
+ shl \$12,$d1
+ and \$0x3ffffff,%rdx # h[1]
+ shr \$14,$h1
+ or $d1,$h0
+ shl \$24,$h2
+ and \$0x3ffffff,$h0 # h[2]
+ shr \$40,$d2
+ and \$0x3ffffff,$h1 # h[3]
+ or $d2,$h2 # h[4]
+
+ vmovd %rax#d,$H0
+ vmovd %rdx#d,$H1
+ vmovd $h0#d,$H2
+ vmovd $h1#d,$H3
+ vmovd $h2#d,$H4
+ movl \$1,20($ctx) # set is_base2_26
+
+ call __poly1305_init_avx
+
+.Lproceed_avx:
+ mov %r15,$len
+ pop %r15
+.cfi_restore %r15
+ pop %r14
+.cfi_restore %r14
+ pop %r13
+.cfi_restore %r13
+ pop %r12
+.cfi_restore %r12
+ pop %rbx
+.cfi_restore %rbx
+ pop %rbp
+.cfi_restore %rbp
+.Lbase2_64_avx_epilogue:
+ jmp .Ldo_avx
+.cfi_endproc
+
+.align 32
+.Leven_avx:
+.cfi_startproc
+ vmovd 4*0($ctx),$H0 # load hash value
+ vmovd 4*1($ctx),$H1
+ vmovd 4*2($ctx),$H2
+ vmovd 4*3($ctx),$H3
+ vmovd 4*4($ctx),$H4
+
+.Ldo_avx:
+___
+$code.=<<___ if (!$win64);
+ lea 8(%rsp),%r10
+.cfi_def_cfa_register %r10
+ and \$-32,%rsp
+ sub \$-8,%rsp
+ lea -0x58(%rsp),%r11
+ sub \$0x178,%rsp
+___
+$code.=<<___ if ($win64);
+ lea -0xf8(%rsp),%r11
+ sub \$0x218,%rsp
+ vmovdqa %xmm6,0x50(%r11)
+ vmovdqa %xmm7,0x60(%r11)
+ vmovdqa %xmm8,0x70(%r11)
+ vmovdqa %xmm9,0x80(%r11)
+ vmovdqa %xmm10,0x90(%r11)
+ vmovdqa %xmm11,0xa0(%r11)
+ vmovdqa %xmm12,0xb0(%r11)
+ vmovdqa %xmm13,0xc0(%r11)
+ vmovdqa %xmm14,0xd0(%r11)
+ vmovdqa %xmm15,0xe0(%r11)
+.Ldo_avx_body:
+___
+$code.=<<___;
+ sub \$64,$len
+ lea -32($inp),%rax
+ cmovc %rax,$inp
+
+ vmovdqu `16*3`($ctx),$D4 # preload r0^2
+ lea `16*3+64`($ctx),$ctx # size optimization
+ lea .Lconst(%rip),%rcx
+
+ ################################################################
+ # load input
+ vmovdqu 16*2($inp),$T0
+ vmovdqu 16*3($inp),$T1
+ vmovdqa 64(%rcx),$MASK # .Lmask26
+
+ vpsrldq \$6,$T0,$T2 # splat input
+ vpsrldq \$6,$T1,$T3
+ vpunpckhqdq $T1,$T0,$T4 # 4
+ vpunpcklqdq $T1,$T0,$T0 # 0:1
+ vpunpcklqdq $T3,$T2,$T3 # 2:3
+
+ vpsrlq \$40,$T4,$T4 # 4
+ vpsrlq \$26,$T0,$T1
+ vpand $MASK,$T0,$T0 # 0
+ vpsrlq \$4,$T3,$T2
+ vpand $MASK,$T1,$T1 # 1
+ vpsrlq \$30,$T3,$T3
+ vpand $MASK,$T2,$T2 # 2
+ vpand $MASK,$T3,$T3 # 3
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
+
+ jbe .Lskip_loop_avx
+
+ # expand and copy pre-calculated table to stack
+ vmovdqu `16*1-64`($ctx),$D1
+ vmovdqu `16*2-64`($ctx),$D2
+ vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
+ vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
+ vmovdqa $D3,-0x90(%r11)
+ vmovdqa $D0,0x00(%rsp)
+ vpshufd \$0xEE,$D1,$D4
+ vmovdqu `16*3-64`($ctx),$D0
+ vpshufd \$0x44,$D1,$D1
+ vmovdqa $D4,-0x80(%r11)
+ vmovdqa $D1,0x10(%rsp)
+ vpshufd \$0xEE,$D2,$D3
+ vmovdqu `16*4-64`($ctx),$D1
+ vpshufd \$0x44,$D2,$D2
+ vmovdqa $D3,-0x70(%r11)
+ vmovdqa $D2,0x20(%rsp)
+ vpshufd \$0xEE,$D0,$D4
+ vmovdqu `16*5-64`($ctx),$D2
+ vpshufd \$0x44,$D0,$D0
+ vmovdqa $D4,-0x60(%r11)
+ vmovdqa $D0,0x30(%rsp)
+ vpshufd \$0xEE,$D1,$D3
+ vmovdqu `16*6-64`($ctx),$D0
+ vpshufd \$0x44,$D1,$D1
+ vmovdqa $D3,-0x50(%r11)
+ vmovdqa $D1,0x40(%rsp)
+ vpshufd \$0xEE,$D2,$D4
+ vmovdqu `16*7-64`($ctx),$D1
+ vpshufd \$0x44,$D2,$D2
+ vmovdqa $D4,-0x40(%r11)
+ vmovdqa $D2,0x50(%rsp)
+ vpshufd \$0xEE,$D0,$D3
+ vmovdqu `16*8-64`($ctx),$D2
+ vpshufd \$0x44,$D0,$D0
+ vmovdqa $D3,-0x30(%r11)
+ vmovdqa $D0,0x60(%rsp)
+ vpshufd \$0xEE,$D1,$D4
+ vpshufd \$0x44,$D1,$D1
+ vmovdqa $D4,-0x20(%r11)
+ vmovdqa $D1,0x70(%rsp)
+ vpshufd \$0xEE,$D2,$D3
+ vmovdqa 0x00(%rsp),$D4 # preload r0^2
+ vpshufd \$0x44,$D2,$D2
+ vmovdqa $D3,-0x10(%r11)
+ vmovdqa $D2,0x80(%rsp)
+
+ jmp .Loop_avx
+
+.align 32
+.Loop_avx:
+ ################################################################
+ # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ # \___________________/
+ # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ # \___________________/ \____________________/
+ #
+ # Note that we start with inp[2:3]*r^2. This is because it
+	# doesn't depend on the reduction in the previous iteration.
+ ################################################################
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ #
+ # though note that $Tx and $Hx are "reversed" in this section,
+ # and $D4 is preloaded with r0^2...
+
+ vpmuludq $T0,$D4,$D0 # d0 = h0*r0
+ vpmuludq $T1,$D4,$D1 # d1 = h1*r0
+ vmovdqa $H2,0x20(%r11) # offload hash
+	vpmuludq	$T2,$D4,$D2		# d2 = h2*r0
+ vmovdqa 0x10(%rsp),$H2 # r1^2
+ vpmuludq $T3,$D4,$D3 # d3 = h3*r0
+ vpmuludq $T4,$D4,$D4 # d4 = h4*r0
+
+ vmovdqa $H0,0x00(%r11) #
+ vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
+ vmovdqa $H1,0x10(%r11) #
+ vpmuludq $T3,$H2,$H1 # h3*r1
+ vpaddq $H0,$D0,$D0 # d0 += h4*s1
+ vpaddq $H1,$D4,$D4 # d4 += h3*r1
+ vmovdqa $H3,0x30(%r11) #
+ vpmuludq $T2,$H2,$H0 # h2*r1
+ vpmuludq $T1,$H2,$H1 # h1*r1
+ vpaddq $H0,$D3,$D3 # d3 += h2*r1
+ vmovdqa 0x30(%rsp),$H3 # r2^2
+ vpaddq $H1,$D2,$D2 # d2 += h1*r1
+ vmovdqa $H4,0x40(%r11) #
+ vpmuludq $T0,$H2,$H2 # h0*r1
+ vpmuludq $T2,$H3,$H0 # h2*r2
+ vpaddq $H2,$D1,$D1 # d1 += h0*r1
+
+ vmovdqa 0x40(%rsp),$H4 # s2^2
+ vpaddq $H0,$D4,$D4 # d4 += h2*r2
+ vpmuludq $T1,$H3,$H1 # h1*r2
+ vpmuludq $T0,$H3,$H3 # h0*r2
+ vpaddq $H1,$D3,$D3 # d3 += h1*r2
+ vmovdqa 0x50(%rsp),$H2 # r3^2
+ vpaddq $H3,$D2,$D2 # d2 += h0*r2
+ vpmuludq $T4,$H4,$H0 # h4*s2
+ vpmuludq $T3,$H4,$H4 # h3*s2
+ vpaddq $H0,$D1,$D1 # d1 += h4*s2
+ vmovdqa 0x60(%rsp),$H3 # s3^2
+ vpaddq $H4,$D0,$D0 # d0 += h3*s2
+
+ vmovdqa 0x80(%rsp),$H4 # s4^2
+ vpmuludq $T1,$H2,$H1 # h1*r3
+ vpmuludq $T0,$H2,$H2 # h0*r3
+ vpaddq $H1,$D4,$D4 # d4 += h1*r3
+ vpaddq $H2,$D3,$D3 # d3 += h0*r3
+ vpmuludq $T4,$H3,$H0 # h4*s3
+ vpmuludq $T3,$H3,$H1 # h3*s3
+ vpaddq $H0,$D2,$D2 # d2 += h4*s3
+ vmovdqu 16*0($inp),$H0 # load input
+ vpaddq $H1,$D1,$D1 # d1 += h3*s3
+ vpmuludq $T2,$H3,$H3 # h2*s3
+ vpmuludq $T2,$H4,$T2 # h2*s4
+ vpaddq $H3,$D0,$D0 # d0 += h2*s3
+
+ vmovdqu 16*1($inp),$H1 #
+ vpaddq $T2,$D1,$D1 # d1 += h2*s4
+ vpmuludq $T3,$H4,$T3 # h3*s4
+ vpmuludq $T4,$H4,$T4 # h4*s4
+ vpsrldq \$6,$H0,$H2 # splat input
+ vpaddq $T3,$D2,$D2 # d2 += h3*s4
+ vpaddq $T4,$D3,$D3 # d3 += h4*s4
+ vpsrldq \$6,$H1,$H3 #
+ vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
+ vpmuludq $T1,$H4,$T0 # h1*s4
+ vpunpckhqdq $H1,$H0,$H4 # 4
+ vpaddq $T4,$D4,$D4 # d4 += h0*r4
+ vmovdqa -0x90(%r11),$T4 # r0^4
+ vpaddq $T0,$D0,$D0 # d0 += h1*s4
+
+ vpunpcklqdq $H1,$H0,$H0 # 0:1
+ vpunpcklqdq $H3,$H2,$H3 # 2:3
+
+ #vpsrlq \$40,$H4,$H4 # 4
+ vpsrldq \$`40/8`,$H4,$H4 # 4
+ vpsrlq \$26,$H0,$H1
+ vpand $MASK,$H0,$H0 # 0
+ vpsrlq \$4,$H3,$H2
+ vpand $MASK,$H1,$H1 # 1
+ vpand 0(%rcx),$H4,$H4 # .Lmask24
+ vpsrlq \$30,$H3,$H3
+ vpand $MASK,$H2,$H2 # 2
+ vpand $MASK,$H3,$H3 # 3
+ vpor 32(%rcx),$H4,$H4 # padbit, yes, always
+
+ vpaddq 0x00(%r11),$H0,$H0 # add hash value
+ vpaddq 0x10(%r11),$H1,$H1
+ vpaddq 0x20(%r11),$H2,$H2
+ vpaddq 0x30(%r11),$H3,$H3
+ vpaddq 0x40(%r11),$H4,$H4
+
+ lea 16*2($inp),%rax
+ lea 16*4($inp),$inp
+ sub \$64,$len
+ cmovc %rax,$inp
+
+ ################################################################
+ # Now we accumulate (inp[0:1]+hash)*r^4
+ ################################################################
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ vpmuludq $H0,$T4,$T0 # h0*r0
+ vpmuludq $H1,$T4,$T1 # h1*r0
+ vpaddq $T0,$D0,$D0
+ vpaddq $T1,$D1,$D1
+ vmovdqa -0x80(%r11),$T2 # r1^4
+ vpmuludq $H2,$T4,$T0 # h2*r0
+ vpmuludq $H3,$T4,$T1 # h3*r0
+ vpaddq $T0,$D2,$D2
+ vpaddq $T1,$D3,$D3
+ vpmuludq $H4,$T4,$T4 # h4*r0
+ vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
+ vpaddq $T4,$D4,$D4
+
+ vpaddq $T0,$D0,$D0 # d0 += h4*s1
+ vpmuludq $H2,$T2,$T1 # h2*r1
+ vpmuludq $H3,$T2,$T0 # h3*r1
+ vpaddq $T1,$D3,$D3 # d3 += h2*r1
+ vmovdqa -0x60(%r11),$T3 # r2^4
+ vpaddq $T0,$D4,$D4 # d4 += h3*r1
+ vpmuludq $H1,$T2,$T1 # h1*r1
+ vpmuludq $H0,$T2,$T2 # h0*r1
+ vpaddq $T1,$D2,$D2 # d2 += h1*r1
+ vpaddq $T2,$D1,$D1 # d1 += h0*r1
+
+ vmovdqa -0x50(%r11),$T4 # s2^4
+ vpmuludq $H2,$T3,$T0 # h2*r2
+ vpmuludq $H1,$T3,$T1 # h1*r2
+ vpaddq $T0,$D4,$D4 # d4 += h2*r2
+ vpaddq $T1,$D3,$D3 # d3 += h1*r2
+ vmovdqa -0x40(%r11),$T2 # r3^4
+ vpmuludq $H0,$T3,$T3 # h0*r2
+ vpmuludq $H4,$T4,$T0 # h4*s2
+ vpaddq $T3,$D2,$D2 # d2 += h0*r2
+ vpaddq $T0,$D1,$D1 # d1 += h4*s2
+ vmovdqa -0x30(%r11),$T3 # s3^4
+ vpmuludq $H3,$T4,$T4 # h3*s2
+ vpmuludq $H1,$T2,$T1 # h1*r3
+ vpaddq $T4,$D0,$D0 # d0 += h3*s2
+
+ vmovdqa -0x10(%r11),$T4 # s4^4
+ vpaddq $T1,$D4,$D4 # d4 += h1*r3
+ vpmuludq $H0,$T2,$T2 # h0*r3
+ vpmuludq $H4,$T3,$T0 # h4*s3
+ vpaddq $T2,$D3,$D3 # d3 += h0*r3
+ vpaddq $T0,$D2,$D2 # d2 += h4*s3
+ vmovdqu 16*2($inp),$T0 # load input
+ vpmuludq $H3,$T3,$T2 # h3*s3
+ vpmuludq $H2,$T3,$T3 # h2*s3
+ vpaddq $T2,$D1,$D1 # d1 += h3*s3
+ vmovdqu 16*3($inp),$T1 #
+ vpaddq $T3,$D0,$D0 # d0 += h2*s3
+
+ vpmuludq $H2,$T4,$H2 # h2*s4
+ vpmuludq $H3,$T4,$H3 # h3*s4
+ vpsrldq \$6,$T0,$T2 # splat input
+ vpaddq $H2,$D1,$D1 # d1 += h2*s4
+ vpmuludq $H4,$T4,$H4 # h4*s4
+ vpsrldq \$6,$T1,$T3 #
+ vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
+ vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
+ vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
+ vpmuludq $H1,$T4,$H0
+ vpunpckhqdq $T1,$T0,$T4 # 4
+ vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
+ vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
+
+ vpunpcklqdq $T1,$T0,$T0 # 0:1
+ vpunpcklqdq $T3,$T2,$T3 # 2:3
+
+ #vpsrlq \$40,$T4,$T4 # 4
+ vpsrldq \$`40/8`,$T4,$T4 # 4
+ vpsrlq \$26,$T0,$T1
+ vmovdqa 0x00(%rsp),$D4 # preload r0^2
+ vpand $MASK,$T0,$T0 # 0
+ vpsrlq \$4,$T3,$T2
+ vpand $MASK,$T1,$T1 # 1
+ vpand 0(%rcx),$T4,$T4 # .Lmask24
+ vpsrlq \$30,$T3,$T3
+ vpand $MASK,$T2,$T2 # 2
+ vpand $MASK,$T3,$T3 # 3
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
+
+ ################################################################
+ # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ # and P. Schwabe
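+	# carry chain (c = limb >> 26), interleaved to hide latency: h3->h4,
+	# h0->h1, h1->h2 (with the h4 carry extracted in between), h4->h0
+	# (added as c + 4*c, i.e. *5, since 2^130 == 5 mod p), h2->h3,
+	# h0->h1, h3->h4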
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$D1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H4,$D0
+ vpand $MASK,$H4,$H4
+
+ vpsrlq \$26,$H1,$D1
+ vpand $MASK,$H1,$H1
+ vpaddq $D1,$H2,$H2 # h1 -> h2
+
+ vpaddq $D0,$H0,$H0
+ vpsllq \$2,$D0,$D0
+ vpaddq $D0,$H0,$H0 # h4 -> h0
+
+ vpsrlq \$26,$H2,$D2
+ vpand $MASK,$H2,$H2
+ vpaddq $D2,$H3,$H3 # h2 -> h3
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ ja .Loop_avx
+
+.Lskip_loop_avx:
+ ################################################################
+ # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
+ add \$32,$len
+ jnz .Long_tail_avx
+
+ vpaddq $H2,$T2,$T2
+ vpaddq $H0,$T0,$T0
+ vpaddq $H1,$T1,$T1
+ vpaddq $H3,$T3,$T3
+ vpaddq $H4,$T4,$T4
+
+.Long_tail_avx:
+ vmovdqa $H2,0x20(%r11)
+ vmovdqa $H0,0x00(%r11)
+ vmovdqa $H1,0x10(%r11)
+ vmovdqa $H3,0x30(%r11)
+ vmovdqa $H4,0x40(%r11)
+
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ vpmuludq $T2,$D4,$D2 # d2 = h2*r0
+ vpmuludq $T0,$D4,$D0 # d0 = h0*r0
+ vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
+ vpmuludq $T1,$D4,$D1 # d1 = h1*r0
+ vpmuludq $T3,$D4,$D3 # d3 = h3*r0
+ vpmuludq $T4,$D4,$D4 # d4 = h4*r0
+
+ vpmuludq $T3,$H2,$H0 # h3*r1
+ vpaddq $H0,$D4,$D4 # d4 += h3*r1
+ vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
+ vpmuludq $T2,$H2,$H1 # h2*r1
+ vpaddq $H1,$D3,$D3 # d3 += h2*r1
+ vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
+ vpmuludq $T1,$H2,$H0 # h1*r1
+ vpaddq $H0,$D2,$D2 # d2 += h1*r1
+ vpmuludq $T0,$H2,$H2 # h0*r1
+ vpaddq $H2,$D1,$D1 # d1 += h0*r1
+ vpmuludq $T4,$H3,$H3 # h4*s1
+ vpaddq $H3,$D0,$D0 # d0 += h4*s1
+
+ vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
+ vpmuludq $T2,$H4,$H1 # h2*r2
+ vpaddq $H1,$D4,$D4 # d4 += h2*r2
+ vpmuludq $T1,$H4,$H0 # h1*r2
+ vpaddq $H0,$D3,$D3 # d3 += h1*r2
+ vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
+ vpmuludq $T0,$H4,$H4 # h0*r2
+ vpaddq $H4,$D2,$D2 # d2 += h0*r2
+ vpmuludq $T4,$H2,$H1 # h4*s2
+ vpaddq $H1,$D1,$D1 # d1 += h4*s2
+ vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
+ vpmuludq $T3,$H2,$H2 # h3*s2
+ vpaddq $H2,$D0,$D0 # d0 += h3*s2
+
+ vpmuludq $T1,$H3,$H0 # h1*r3
+ vpaddq $H0,$D4,$D4 # d4 += h1*r3
+ vpmuludq $T0,$H3,$H3 # h0*r3
+ vpaddq $H3,$D3,$D3 # d3 += h0*r3
+ vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
+ vpmuludq $T4,$H4,$H1 # h4*s3
+ vpaddq $H1,$D2,$D2 # d2 += h4*s3
+ vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
+ vpmuludq $T3,$H4,$H0 # h3*s3
+ vpaddq $H0,$D1,$D1 # d1 += h3*s3
+ vpmuludq $T2,$H4,$H4 # h2*s3
+ vpaddq $H4,$D0,$D0 # d0 += h2*s3
+
+ vpmuludq $T0,$H2,$H2 # h0*r4
+ vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
+ vpmuludq $T4,$H3,$H1 # h4*s4
+ vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
+ vpmuludq $T3,$H3,$H0 # h3*s4
+ vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
+ vpmuludq $T2,$H3,$H1 # h2*s4
+ vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
+ vpmuludq $T1,$H3,$H3 # h1*s4
+ vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
+
+ jz .Lshort_tail_avx
+
+ vmovdqu 16*0($inp),$H0 # load input
+ vmovdqu 16*1($inp),$H1
+
+ vpsrldq \$6,$H0,$H2 # splat input
+ vpsrldq \$6,$H1,$H3
+ vpunpckhqdq $H1,$H0,$H4 # 4
+ vpunpcklqdq $H1,$H0,$H0 # 0:1
+ vpunpcklqdq $H3,$H2,$H3 # 2:3
+
+ vpsrlq \$40,$H4,$H4 # 4
+ vpsrlq \$26,$H0,$H1
+ vpand $MASK,$H0,$H0 # 0
+ vpsrlq \$4,$H3,$H2
+ vpand $MASK,$H1,$H1 # 1
+ vpsrlq \$30,$H3,$H3
+ vpand $MASK,$H2,$H2 # 2
+ vpand $MASK,$H3,$H3 # 3
+ vpor 32(%rcx),$H4,$H4 # padbit, yes, always
+
+ vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
+ vpaddq 0x00(%r11),$H0,$H0
+ vpaddq 0x10(%r11),$H1,$H1
+ vpaddq 0x20(%r11),$H2,$H2
+ vpaddq 0x30(%r11),$H3,$H3
+ vpaddq 0x40(%r11),$H4,$H4
+
+ ################################################################
+ # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
+
+ vpmuludq $H0,$T4,$T0 # h0*r0
+ vpaddq $T0,$D0,$D0 # d0 += h0*r0
+ vpmuludq $H1,$T4,$T1 # h1*r0
+ vpaddq $T1,$D1,$D1 # d1 += h1*r0
+ vpmuludq $H2,$T4,$T0 # h2*r0
+ vpaddq $T0,$D2,$D2 # d2 += h2*r0
+ vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
+ vpmuludq $H3,$T4,$T1 # h3*r0
+ vpaddq $T1,$D3,$D3 # d3 += h3*r0
+ vpmuludq $H4,$T4,$T4 # h4*r0
+ vpaddq $T4,$D4,$D4 # d4 += h4*r0
+
+ vpmuludq $H3,$T2,$T0 # h3*r1
+ vpaddq $T0,$D4,$D4 # d4 += h3*r1
+ vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
+ vpmuludq $H2,$T2,$T1 # h2*r1
+ vpaddq $T1,$D3,$D3 # d3 += h2*r1
+ vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
+ vpmuludq $H1,$T2,$T0 # h1*r1
+ vpaddq $T0,$D2,$D2 # d2 += h1*r1
+ vpmuludq $H0,$T2,$T2 # h0*r1
+ vpaddq $T2,$D1,$D1 # d1 += h0*r1
+ vpmuludq $H4,$T3,$T3 # h4*s1
+ vpaddq $T3,$D0,$D0 # d0 += h4*s1
+
+ vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
+ vpmuludq $H2,$T4,$T1 # h2*r2
+ vpaddq $T1,$D4,$D4 # d4 += h2*r2
+ vpmuludq $H1,$T4,$T0 # h1*r2
+ vpaddq $T0,$D3,$D3 # d3 += h1*r2
+ vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
+ vpmuludq $H0,$T4,$T4 # h0*r2
+ vpaddq $T4,$D2,$D2 # d2 += h0*r2
+ vpmuludq $H4,$T2,$T1 # h4*s2
+ vpaddq $T1,$D1,$D1 # d1 += h4*s2
+ vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
+ vpmuludq $H3,$T2,$T2 # h3*s2
+ vpaddq $T2,$D0,$D0 # d0 += h3*s2
+
+ vpmuludq $H1,$T3,$T0 # h1*r3
+ vpaddq $T0,$D4,$D4 # d4 += h1*r3
+ vpmuludq $H0,$T3,$T3 # h0*r3
+ vpaddq $T3,$D3,$D3 # d3 += h0*r3
+ vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
+ vpmuludq $H4,$T4,$T1 # h4*s3
+ vpaddq $T1,$D2,$D2 # d2 += h4*s3
+ vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
+ vpmuludq $H3,$T4,$T0 # h3*s3
+ vpaddq $T0,$D1,$D1 # d1 += h3*s3
+ vpmuludq $H2,$T4,$T4 # h2*s3
+ vpaddq $T4,$D0,$D0 # d0 += h2*s3
+
+ vpmuludq $H0,$T2,$T2 # h0*r4
+ vpaddq $T2,$D4,$D4 # d4 += h0*r4
+ vpmuludq $H4,$T3,$T1 # h4*s4
+ vpaddq $T1,$D3,$D3 # d3 += h4*s4
+ vpmuludq $H3,$T3,$T0 # h3*s4
+ vpaddq $T0,$D2,$D2 # d2 += h3*s4
+ vpmuludq $H2,$T3,$T1 # h2*s4
+ vpaddq $T1,$D1,$D1 # d1 += h2*s4
+ vpmuludq $H1,$T3,$T3 # h1*s4
+ vpaddq $T3,$D0,$D0 # d0 += h1*s4
+
+.Lshort_tail_avx:
+ ################################################################
+ # horizontal addition
+
+ vpsrldq \$8,$D4,$T4
+ vpsrldq \$8,$D3,$T3
+ vpsrldq \$8,$D1,$T1
+ vpsrldq \$8,$D0,$T0
+ vpsrldq \$8,$D2,$T2
+ vpaddq $T3,$D3,$D3
+ vpaddq $T4,$D4,$D4
+ vpaddq $T0,$D0,$D0
+ vpaddq $T1,$D1,$D1
+ vpaddq $T2,$D2,$D2
+
+ ################################################################
+ # lazy reduction
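+	# (each accumulator lane is carried down to 26 bits; the carry out
+	# of h4 wraps into h0 multiplied by 5 = 4+1, hence the shift-by-2
+	# plus the two additions below)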
+
+ vpsrlq \$26,$D3,$H3
+ vpand $MASK,$D3,$D3
+ vpaddq $H3,$D4,$D4 # h3 -> h4
+
+ vpsrlq \$26,$D0,$H0
+ vpand $MASK,$D0,$D0
+ vpaddq $H0,$D1,$D1 # h0 -> h1
+
+ vpsrlq \$26,$D4,$H4
+ vpand $MASK,$D4,$D4
+
+ vpsrlq \$26,$D1,$H1
+ vpand $MASK,$D1,$D1
+ vpaddq $H1,$D2,$D2 # h1 -> h2
+
+ vpaddq $H4,$D0,$D0
+ vpsllq \$2,$H4,$H4
+ vpaddq $H4,$D0,$D0 # h4 -> h0
+
+ vpsrlq \$26,$D2,$H2
+ vpand $MASK,$D2,$D2
+ vpaddq $H2,$D3,$D3 # h2 -> h3
+
+ vpsrlq \$26,$D0,$H0
+ vpand $MASK,$D0,$D0
+ vpaddq $H0,$D1,$D1 # h0 -> h1
+
+ vpsrlq \$26,$D3,$H3
+ vpand $MASK,$D3,$D3
+ vpaddq $H3,$D4,$D4 # h3 -> h4
+
+ vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
+ vmovd $D1,`4*1-48-64`($ctx)
+ vmovd $D2,`4*2-48-64`($ctx)
+ vmovd $D3,`4*3-48-64`($ctx)
+ vmovd $D4,`4*4-48-64`($ctx)
+___
+$code.=<<___ if ($win64);
+ vmovdqa 0x50(%r11),%xmm6
+ vmovdqa 0x60(%r11),%xmm7
+ vmovdqa 0x70(%r11),%xmm8
+ vmovdqa 0x80(%r11),%xmm9
+ vmovdqa 0x90(%r11),%xmm10
+ vmovdqa 0xa0(%r11),%xmm11
+ vmovdqa 0xb0(%r11),%xmm12
+ vmovdqa 0xc0(%r11),%xmm13
+ vmovdqa 0xd0(%r11),%xmm14
+ vmovdqa 0xe0(%r11),%xmm15
+ lea 0xf8(%r11),%rsp
+.Ldo_avx_epilogue:
+___
+$code.=<<___ if (!$win64);
+ lea -8(%r10),%rsp
+.cfi_def_cfa_register %rsp
+___
+$code.=<<___;
+ vzeroupper
+ RET
+.cfi_endproc
+___
+&end_function("poly1305_blocks_avx");
+
+&declare_function("poly1305_emit_avx", 32, 3);
+$code.=<<___;
+ cmpl \$0,20($ctx) # is_base2_26?
+ je .Lemit
+
+ mov 0($ctx),%eax # load hash value base 2^26
+ mov 4($ctx),%ecx
+ mov 8($ctx),%r8d
+ mov 12($ctx),%r11d
+ mov 16($ctx),%r10d
+
+ shl \$26,%rcx # base 2^26 -> base 2^64
+ mov %r8,%r9
+ shl \$52,%r8
+ add %rcx,%rax
+ shr \$12,%r9
+ add %rax,%r8 # h0
+ adc \$0,%r9
+
+ shl \$14,%r11
+ mov %r10,%rax
+ shr \$24,%r10
+ add %r11,%r9
+ shl \$40,%rax
+ add %rax,%r9 # h1
+ adc \$0,%r10 # h2
+
+ mov %r10,%rax # could be partially reduced, so reduce
+ mov %r10,%rcx
+ and \$3,%r10
+ shr \$2,%rax
+ and \$-4,%rcx
+ add %rcx,%rax
+ add %rax,%r8
+ adc \$0,%r9
+ adc \$0,%r10
+
+ mov %r8,%rax
+ add \$5,%r8 # compare to modulus
+ mov %r9,%rcx
+ adc \$0,%r9
+ adc \$0,%r10
+ shr \$2,%r10 # did 130-bit value overflow?
+ cmovnz %r8,%rax
+ cmovnz %r9,%rcx
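+	# (h+5 overflows into bit 130 exactly when h >= 2^130-5, in which
+	# case h+5 mod 2^130 is the fully reduced value, hence the cmovs)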
+
+ add 0($nonce),%rax # accumulate nonce
+ adc 8($nonce),%rcx
+ mov %rax,0($mac) # write result
+ mov %rcx,8($mac)
+
+ RET
+___
+&end_function("poly1305_emit_avx");
+
+if ($avx>1) {
+
+my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
+ map("%ymm$_",(0..15));
+my $S4=$MASK;
+
+sub poly1305_blocks_avxN {
+ my ($avx512) = @_;
+ my $suffix = $avx512 ? "_avx512" : "";
+$code.=<<___;
+.cfi_startproc
+ mov 20($ctx),%r8d # is_base2_26
+ cmp \$128,$len
+ jae .Lblocks_avx2$suffix
+ test %r8d,%r8d
+ jz .Lblocks
+
+.Lblocks_avx2$suffix:
+ and \$-16,$len
+ jz .Lno_data_avx2$suffix
+
+ vzeroupper
+
+ test %r8d,%r8d
+ jz .Lbase2_64_avx2$suffix
+
+ test \$63,$len
+ jz .Leven_avx2$suffix
+
+ push %rbp
+.cfi_push %rbp
+ mov %rsp,%rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lblocks_avx2_body$suffix:
+
+ mov $len,%r15 # reassign $len
+
+ mov 0($ctx),$d1 # load hash value
+ mov 8($ctx),$d2
+ mov 16($ctx),$h2#d
+
+ mov 24($ctx),$r0 # load r
+ mov 32($ctx),$s1
+
+ ################################# base 2^26 -> base 2^64
+ mov $d1#d,$h0#d
+ and \$`-1*(1<<31)`,$d1
+ mov $d2,$r1 # borrow $r1
+ mov $d2#d,$h1#d
+ and \$`-1*(1<<31)`,$d2
+
+ shr \$6,$d1
+ shl \$52,$r1
+ add $d1,$h0
+ shr \$12,$h1
+ shr \$18,$d2
+ add $r1,$h0
+ adc $d2,$h1
+
+ mov $h2,$d1
+ shl \$40,$d1
+ shr \$24,$h2
+ add $d1,$h1
+ adc \$0,$h2 # can be partially reduced...
+
+ mov \$-4,$d2 # ... so reduce
+ mov $h2,$d1
+ and $h2,$d2
+ shr \$2,$d1
+ and \$3,$h2
+ add $d2,$d1 # =*5
+ add $d1,$h0
+ adc \$0,$h1
+ adc \$0,$h2
+
+ mov $s1,$r1
+ mov $s1,%rax
+ shr \$2,$s1
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+
+.Lbase2_26_pre_avx2$suffix:
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+ adc $padbit,$h2
+ sub \$16,%r15
+
+ call __poly1305_block
+ mov $r1,%rax
+
+ test \$63,%r15
+ jnz .Lbase2_26_pre_avx2$suffix
+
+ test $padbit,$padbit # if $padbit is zero,
+ jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format
+
+ ################################# base 2^64 -> base 2^26
+ mov $h0,%rax
+ mov $h0,%rdx
+ shr \$52,$h0
+ mov $h1,$r0
+ mov $h1,$r1
+ shr \$26,%rdx
+ and \$0x3ffffff,%rax # h[0]
+ shl \$12,$r0
+ and \$0x3ffffff,%rdx # h[1]
+ shr \$14,$h1
+ or $r0,$h0
+ shl \$24,$h2
+ and \$0x3ffffff,$h0 # h[2]
+ shr \$40,$r1
+ and \$0x3ffffff,$h1 # h[3]
+ or $r1,$h2 # h[4]
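+	# (h[0..4] now live in %rax, %rdx, $h0, $h1 and $h2 respectively)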
+
+ test %r15,%r15
+ jz .Lstore_base2_26_avx2$suffix
+
+ vmovd %rax#d,%x#$H0
+ vmovd %rdx#d,%x#$H1
+ vmovd $h0#d,%x#$H2
+ vmovd $h1#d,%x#$H3
+ vmovd $h2#d,%x#$H4
+ jmp .Lproceed_avx2$suffix
+
+.align 32
+.Lstore_base2_64_avx2$suffix:
+ mov $h0,0($ctx)
+ mov $h1,8($ctx)
+ mov $h2,16($ctx) # note that is_base2_26 is zeroed
+ jmp .Ldone_avx2$suffix
+
+.align 16
+.Lstore_base2_26_avx2$suffix:
+ mov %rax#d,0($ctx) # store hash value base 2^26
+ mov %rdx#d,4($ctx)
+ mov $h0#d,8($ctx)
+ mov $h1#d,12($ctx)
+ mov $h2#d,16($ctx)
+.align 16
+.Ldone_avx2$suffix:
+ pop %r15
+.cfi_restore %r15
+ pop %r14
+.cfi_restore %r14
+ pop %r13
+.cfi_restore %r13
+ pop %r12
+.cfi_restore %r12
+ pop %rbx
+.cfi_restore %rbx
+ pop %rbp
+.cfi_restore %rbp
+.Lno_data_avx2$suffix:
+.Lblocks_avx2_epilogue$suffix:
+ RET
+.cfi_endproc
+
+.align 32
+.Lbase2_64_avx2$suffix:
+.cfi_startproc
+ push %rbp
+.cfi_push %rbp
+ mov %rsp,%rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lbase2_64_avx2_body$suffix:
+
+ mov $len,%r15 # reassign $len
+
+ mov 24($ctx),$r0 # load r
+ mov 32($ctx),$s1
+
+ mov 0($ctx),$h0 # load hash value
+ mov 8($ctx),$h1
+ mov 16($ctx),$h2#d
+
+ mov $s1,$r1
+ mov $s1,%rax
+ shr \$2,$s1
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+
+ test \$63,$len
+ jz .Linit_avx2$suffix
+
+.Lbase2_64_pre_avx2$suffix:
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+ adc $padbit,$h2
+ sub \$16,%r15
+
+ call __poly1305_block
+ mov $r1,%rax
+
+ test \$63,%r15
+ jnz .Lbase2_64_pre_avx2$suffix
+
+.Linit_avx2$suffix:
+ ################################# base 2^64 -> base 2^26
+ mov $h0,%rax
+ mov $h0,%rdx
+ shr \$52,$h0
+ mov $h1,$d1
+ mov $h1,$d2
+ shr \$26,%rdx
+ and \$0x3ffffff,%rax # h[0]
+ shl \$12,$d1
+ and \$0x3ffffff,%rdx # h[1]
+ shr \$14,$h1
+ or $d1,$h0
+ shl \$24,$h2
+ and \$0x3ffffff,$h0 # h[2]
+ shr \$40,$d2
+ and \$0x3ffffff,$h1 # h[3]
+ or $d2,$h2 # h[4]
+
+ vmovd %rax#d,%x#$H0
+ vmovd %rdx#d,%x#$H1
+ vmovd $h0#d,%x#$H2
+ vmovd $h1#d,%x#$H3
+ vmovd $h2#d,%x#$H4
+ movl \$1,20($ctx) # set is_base2_26
+
+ call __poly1305_init_avx
+
+.Lproceed_avx2$suffix:
+ mov %r15,$len # restore $len
+___
+$code.=<<___ if (!$kernel);
+ mov OPENSSL_ia32cap_P+8(%rip),%r9d
+ mov \$`(1<<31|1<<30|1<<16)`,%r11d
+___
+$code.=<<___;
+ pop %r15
+.cfi_restore %r15
+ pop %r14
+.cfi_restore %r14
+ pop %r13
+.cfi_restore %r13
+ pop %r12
+.cfi_restore %r12
+ pop %rbx
+.cfi_restore %rbx
+ pop %rbp
+.cfi_restore %rbp
+.Lbase2_64_avx2_epilogue$suffix:
+ jmp .Ldo_avx2$suffix
+.cfi_endproc
+
+.align 32
+.Leven_avx2$suffix:
+.cfi_startproc
+___
+$code.=<<___ if (!$kernel);
+ mov OPENSSL_ia32cap_P+8(%rip),%r9d
+___
+$code.=<<___;
+ vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
+ vmovd 4*1($ctx),%x#$H1
+ vmovd 4*2($ctx),%x#$H2
+ vmovd 4*3($ctx),%x#$H3
+ vmovd 4*4($ctx),%x#$H4
+
+.Ldo_avx2$suffix:
+___
+$code.=<<___ if (!$kernel && $avx>2);
+ cmp \$512,$len
+	jb		.Lskip_avx512$suffix
+ and %r11d,%r9d
+ test \$`1<<16`,%r9d # check for AVX512F
+ jnz .Lblocks_avx512
+.Lskip_avx512$suffix:
+___
+$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
+ cmp \$512,$len
+ jae .Lblocks_avx512
+___
+$code.=<<___ if (!$win64);
+ lea 8(%rsp),%r10
+.cfi_def_cfa_register %r10
+ sub \$0x128,%rsp
+___
+$code.=<<___ if ($win64);
+ lea 8(%rsp),%r10
+ sub \$0x1c8,%rsp
+ vmovdqa %xmm6,-0xb0(%r10)
+ vmovdqa %xmm7,-0xa0(%r10)
+ vmovdqa %xmm8,-0x90(%r10)
+ vmovdqa %xmm9,-0x80(%r10)
+ vmovdqa %xmm10,-0x70(%r10)
+ vmovdqa %xmm11,-0x60(%r10)
+ vmovdqa %xmm12,-0x50(%r10)
+ vmovdqa %xmm13,-0x40(%r10)
+ vmovdqa %xmm14,-0x30(%r10)
+ vmovdqa %xmm15,-0x20(%r10)
+.Ldo_avx2_body$suffix:
+___
+$code.=<<___;
+ lea .Lconst(%rip),%rcx
+ lea 48+64($ctx),$ctx # size optimization
+ vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
+
+ # expand and copy pre-calculated table to stack
+ vmovdqu `16*0-64`($ctx),%x#$T2
+ and \$-512,%rsp
+ vmovdqu `16*1-64`($ctx),%x#$T3
+ vmovdqu `16*2-64`($ctx),%x#$T4
+ vmovdqu `16*3-64`($ctx),%x#$D0
+ vmovdqu `16*4-64`($ctx),%x#$D1
+ vmovdqu `16*5-64`($ctx),%x#$D2
+ lea 0x90(%rsp),%rax # size optimization
+ vmovdqu `16*6-64`($ctx),%x#$D3
+ vpermd $T2,$T0,$T2 # 00003412 -> 14243444
+ vmovdqu `16*7-64`($ctx),%x#$D4
+ vpermd $T3,$T0,$T3
+ vmovdqu `16*8-64`($ctx),%x#$MASK
+ vpermd $T4,$T0,$T4
+ vmovdqa $T2,0x00(%rsp)
+ vpermd $D0,$T0,$D0
+ vmovdqa $T3,0x20-0x90(%rax)
+ vpermd $D1,$T0,$D1
+ vmovdqa $T4,0x40-0x90(%rax)
+ vpermd $D2,$T0,$D2
+ vmovdqa $D0,0x60-0x90(%rax)
+ vpermd $D3,$T0,$D3
+ vmovdqa $D1,0x80-0x90(%rax)
+ vpermd $D4,$T0,$D4
+ vmovdqa $D2,0xa0-0x90(%rax)
+ vpermd $MASK,$T0,$MASK
+ vmovdqa $D3,0xc0-0x90(%rax)
+ vmovdqa $D4,0xe0-0x90(%rax)
+ vmovdqa $MASK,0x100-0x90(%rax)
+ vmovdqa 64(%rcx),$MASK # .Lmask26
+
+ ################################################################
+ # load input
+ vmovdqu 16*0($inp),%x#$T0
+ vmovdqu 16*1($inp),%x#$T1
+ vinserti128 \$1,16*2($inp),$T0,$T0
+ vinserti128 \$1,16*3($inp),$T1,$T1
+ lea 16*4($inp),$inp
+
+ vpsrldq \$6,$T0,$T2 # splat input
+ vpsrldq \$6,$T1,$T3
+ vpunpckhqdq $T1,$T0,$T4 # 4
+ vpunpcklqdq $T3,$T2,$T2 # 2:3
+ vpunpcklqdq $T1,$T0,$T0 # 0:1
+
+ vpsrlq \$30,$T2,$T3
+ vpsrlq \$4,$T2,$T2
+ vpsrlq \$26,$T0,$T1
+ vpsrlq \$40,$T4,$T4 # 4
+ vpand $MASK,$T2,$T2 # 2
+ vpand $MASK,$T0,$T0 # 0
+ vpand $MASK,$T1,$T1 # 1
+ vpand $MASK,$T3,$T3 # 3
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
+
+ vpaddq $H2,$T2,$H2 # accumulate input
+ sub \$64,$len
+ jz .Ltail_avx2$suffix
+ jmp .Loop_avx2$suffix
+
+.align 32
+.Loop_avx2$suffix:
+ ################################################################
+ # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
+ # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
+ # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
+ # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
+ # \________/\__________/
+ ################################################################
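+	# (i.e. four Horner evaluations proceed in parallel, one per 64-bit
+	# lane position: each 64-byte iteration multiplies every lane by r^4
+	# and folds in the next four blocks, and the tail multiplies lane n
+	# by r^(4-n) before the lanes are summed)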
+ #vpaddq $H2,$T2,$H2 # accumulate input
+ vpaddq $H0,$T0,$H0
+ vmovdqa `32*0`(%rsp),$T0 # r0^4
+ vpaddq $H1,$T1,$H1
+ vmovdqa `32*1`(%rsp),$T1 # r1^4
+ vpaddq $H3,$T3,$H3
+ vmovdqa `32*3`(%rsp),$T2 # r2^4
+ vpaddq $H4,$T4,$H4
+ vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
+ vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
+
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ #
+	# however, as h2 is "chronologically" the first one available, pull
+	# the corresponding operations up, so it's
+ #
+ # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
+ # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
+ # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
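+	# (s[] above denote the precomputed 5*r[]: since 2^130 == 5 modulo
+	# 2^130-5, limb products that spill past 2^130 wrap around times 5)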
+
+ vpmuludq $H2,$T0,$D2 # d2 = h2*r0
+ vpmuludq $H2,$T1,$D3 # d3 = h2*r1
+ vpmuludq $H2,$T2,$D4 # d4 = h2*r2
+ vpmuludq $H2,$T3,$D0 # d0 = h2*s3
+ vpmuludq $H2,$S4,$D1 # d1 = h2*s4
+
+ vpmuludq $H0,$T1,$T4 # h0*r1
+ vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
+ vpaddq $T4,$D1,$D1 # d1 += h0*r1
+ vpaddq $H2,$D2,$D2 # d2 += h1*r1
+ vpmuludq $H3,$T1,$T4 # h3*r1
+ vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
+ vpaddq $T4,$D4,$D4 # d4 += h3*r1
+ vpaddq $H2,$D0,$D0 # d0 += h4*s1
+ vmovdqa `32*4-0x90`(%rax),$T1 # s2
+
+ vpmuludq $H0,$T0,$T4 # h0*r0
+ vpmuludq $H1,$T0,$H2 # h1*r0
+ vpaddq $T4,$D0,$D0 # d0 += h0*r0
+ vpaddq $H2,$D1,$D1 # d1 += h1*r0
+ vpmuludq $H3,$T0,$T4 # h3*r0
+ vpmuludq $H4,$T0,$H2 # h4*r0
+ vmovdqu 16*0($inp),%x#$T0 # load input
+ vpaddq $T4,$D3,$D3 # d3 += h3*r0
+ vpaddq $H2,$D4,$D4 # d4 += h4*r0
+ vinserti128 \$1,16*2($inp),$T0,$T0
+
+ vpmuludq $H3,$T1,$T4 # h3*s2
+ vpmuludq $H4,$T1,$H2 # h4*s2
+ vmovdqu 16*1($inp),%x#$T1
+ vpaddq $T4,$D0,$D0 # d0 += h3*s2
+ vpaddq $H2,$D1,$D1 # d1 += h4*s2
+ vmovdqa `32*5-0x90`(%rax),$H2 # r3
+ vpmuludq $H1,$T2,$T4 # h1*r2
+ vpmuludq $H0,$T2,$T2 # h0*r2
+ vpaddq $T4,$D3,$D3 # d3 += h1*r2
+ vpaddq $T2,$D2,$D2 # d2 += h0*r2
+ vinserti128 \$1,16*3($inp),$T1,$T1
+ lea 16*4($inp),$inp
+
+ vpmuludq $H1,$H2,$T4 # h1*r3
+ vpmuludq $H0,$H2,$H2 # h0*r3
+ vpsrldq \$6,$T0,$T2 # splat input
+ vpaddq $T4,$D4,$D4 # d4 += h1*r3
+ vpaddq $H2,$D3,$D3 # d3 += h0*r3
+ vpmuludq $H3,$T3,$T4 # h3*s3
+ vpmuludq $H4,$T3,$H2 # h4*s3
+ vpsrldq \$6,$T1,$T3
+ vpaddq $T4,$D1,$D1 # d1 += h3*s3
+ vpaddq $H2,$D2,$D2 # d2 += h4*s3
+ vpunpckhqdq $T1,$T0,$T4 # 4
+
+ vpmuludq $H3,$S4,$H3 # h3*s4
+ vpmuludq $H4,$S4,$H4 # h4*s4
+ vpunpcklqdq $T1,$T0,$T0 # 0:1
+ vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
+ vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
+ vpunpcklqdq $T3,$T2,$T3 # 2:3
+ vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
+ vpmuludq $H1,$S4,$H0 # h1*s4
+ vmovdqa 64(%rcx),$MASK # .Lmask26
+ vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
+ vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
+
+ ################################################################
+ # lazy reduction (interleaved with tail of input splat)
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$D1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H4,$D4
+ vpand $MASK,$H4,$H4
+
+ vpsrlq \$4,$T3,$T2
+
+ vpsrlq \$26,$H1,$D1
+ vpand $MASK,$H1,$H1
+ vpaddq $D1,$H2,$H2 # h1 -> h2
+
+ vpaddq $D4,$H0,$H0
+ vpsllq \$2,$D4,$D4
+ vpaddq $D4,$H0,$H0 # h4 -> h0
+
+ vpand $MASK,$T2,$T2 # 2
+ vpsrlq \$26,$T0,$T1
+
+ vpsrlq \$26,$H2,$D2
+ vpand $MASK,$H2,$H2
+ vpaddq $D2,$H3,$H3 # h2 -> h3
+
+ vpaddq $T2,$H2,$H2 # modulo-scheduled
+ vpsrlq \$30,$T3,$T3
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$40,$T4,$T4 # 4
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpand $MASK,$T0,$T0 # 0
+ vpand $MASK,$T1,$T1 # 1
+ vpand $MASK,$T3,$T3 # 3
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
+
+ sub \$64,$len
+ jnz .Loop_avx2$suffix
+
+ .byte 0x66,0x90
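+	# (the .byte pair above encodes a two-byte nop)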
+.Ltail_avx2$suffix:
+	################################################################
+	# while the above multiplications were by r^4 in all lanes, in the
+	# last iteration we multiply the least significant lane by r^4 and
+	# the most significant one by r, so this is a copy of the above,
+	# except that references to the precomputed table are displaced by 4...
+
+ #vpaddq $H2,$T2,$H2 # accumulate input
+ vpaddq $H0,$T0,$H0
+ vmovdqu `32*0+4`(%rsp),$T0 # r0^4
+ vpaddq $H1,$T1,$H1
+ vmovdqu `32*1+4`(%rsp),$T1 # r1^4
+ vpaddq $H3,$T3,$H3
+ vmovdqu `32*3+4`(%rsp),$T2 # r2^4
+ vpaddq $H4,$T4,$H4
+ vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
+ vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
+
+ vpmuludq $H2,$T0,$D2 # d2 = h2*r0
+ vpmuludq $H2,$T1,$D3 # d3 = h2*r1
+ vpmuludq $H2,$T2,$D4 # d4 = h2*r2
+ vpmuludq $H2,$T3,$D0 # d0 = h2*s3
+ vpmuludq $H2,$S4,$D1 # d1 = h2*s4
+
+ vpmuludq $H0,$T1,$T4 # h0*r1
+ vpmuludq $H1,$T1,$H2 # h1*r1
+ vpaddq $T4,$D1,$D1 # d1 += h0*r1
+ vpaddq $H2,$D2,$D2 # d2 += h1*r1
+ vpmuludq $H3,$T1,$T4 # h3*r1
+ vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
+ vpaddq $T4,$D4,$D4 # d4 += h3*r1
+ vpaddq $H2,$D0,$D0 # d0 += h4*s1
+
+ vpmuludq $H0,$T0,$T4 # h0*r0
+ vpmuludq $H1,$T0,$H2 # h1*r0
+ vpaddq $T4,$D0,$D0 # d0 += h0*r0
+ vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
+ vpaddq $H2,$D1,$D1 # d1 += h1*r0
+ vpmuludq $H3,$T0,$T4 # h3*r0
+ vpmuludq $H4,$T0,$H2 # h4*r0
+ vpaddq $T4,$D3,$D3 # d3 += h3*r0
+ vpaddq $H2,$D4,$D4 # d4 += h4*r0
+
+ vpmuludq $H3,$T1,$T4 # h3*s2
+ vpmuludq $H4,$T1,$H2 # h4*s2
+ vpaddq $T4,$D0,$D0 # d0 += h3*s2
+ vpaddq $H2,$D1,$D1 # d1 += h4*s2
+ vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
+ vpmuludq $H1,$T2,$T4 # h1*r2
+ vpmuludq $H0,$T2,$T2 # h0*r2
+ vpaddq $T4,$D3,$D3 # d3 += h1*r2
+ vpaddq $T2,$D2,$D2 # d2 += h0*r2
+
+ vpmuludq $H1,$H2,$T4 # h1*r3
+ vpmuludq $H0,$H2,$H2 # h0*r3
+ vpaddq $T4,$D4,$D4 # d4 += h1*r3
+ vpaddq $H2,$D3,$D3 # d3 += h0*r3
+ vpmuludq $H3,$T3,$T4 # h3*s3
+ vpmuludq $H4,$T3,$H2 # h4*s3
+ vpaddq $T4,$D1,$D1 # d1 += h3*s3
+ vpaddq $H2,$D2,$D2 # d2 += h4*s3
+
+ vpmuludq $H3,$S4,$H3 # h3*s4
+ vpmuludq $H4,$S4,$H4 # h4*s4
+ vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
+ vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
+ vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
+ vpmuludq $H1,$S4,$H0 # h1*s4
+ vmovdqa 64(%rcx),$MASK # .Lmask26
+ vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
+ vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
+
+ ################################################################
+ # horizontal addition
+
+ vpsrldq \$8,$D1,$T1
+ vpsrldq \$8,$H2,$T2
+ vpsrldq \$8,$H3,$T3
+ vpsrldq \$8,$H4,$T4
+ vpsrldq \$8,$H0,$T0
+ vpaddq $T1,$D1,$D1
+ vpaddq $T2,$H2,$H2
+ vpaddq $T3,$H3,$H3
+ vpaddq $T4,$H4,$H4
+ vpaddq $T0,$H0,$H0
+
+ vpermq \$0x2,$H3,$T3
+ vpermq \$0x2,$H4,$T4
+ vpermq \$0x2,$H0,$T0
+ vpermq \$0x2,$D1,$T1
+ vpermq \$0x2,$H2,$T2
+ vpaddq $T3,$H3,$H3
+ vpaddq $T4,$H4,$H4
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$D1,$D1
+ vpaddq $T2,$H2,$H2
+
+ ################################################################
+ # lazy reduction
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$D1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H4,$D4
+ vpand $MASK,$H4,$H4
+
+ vpsrlq \$26,$H1,$D1
+ vpand $MASK,$H1,$H1
+ vpaddq $D1,$H2,$H2 # h1 -> h2
+
+ vpaddq $D4,$H0,$H0
+ vpsllq \$2,$D4,$D4
+ vpaddq $D4,$H0,$H0 # h4 -> h0
+
+ vpsrlq \$26,$H2,$D2
+ vpand $MASK,$H2,$H2
+ vpaddq $D2,$H3,$H3 # h2 -> h3
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
+ vmovd %x#$H1,`4*1-48-64`($ctx)
+ vmovd %x#$H2,`4*2-48-64`($ctx)
+ vmovd %x#$H3,`4*3-48-64`($ctx)
+ vmovd %x#$H4,`4*4-48-64`($ctx)
+___
+$code.=<<___ if ($win64);
+ vmovdqa -0xb0(%r10),%xmm6
+ vmovdqa -0xa0(%r10),%xmm7
+ vmovdqa -0x90(%r10),%xmm8
+ vmovdqa -0x80(%r10),%xmm9
+ vmovdqa -0x70(%r10),%xmm10
+ vmovdqa -0x60(%r10),%xmm11
+ vmovdqa -0x50(%r10),%xmm12
+ vmovdqa -0x40(%r10),%xmm13
+ vmovdqa -0x30(%r10),%xmm14
+ vmovdqa -0x20(%r10),%xmm15
+ lea -8(%r10),%rsp
+.Ldo_avx2_epilogue$suffix:
+___
+$code.=<<___ if (!$win64);
+ lea -8(%r10),%rsp
+.cfi_def_cfa_register %rsp
+___
+$code.=<<___;
+ vzeroupper
+ RET
+.cfi_endproc
+___
+if($avx > 2 && $avx512) {
+my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
+my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
+my $PADBIT="%zmm30";
+
+map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
+map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
+map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
+map(s/%y/%z/,($MASK));
+
+$code.=<<___;
+.cfi_startproc
+.Lblocks_avx512:
+ mov \$15,%eax
+ kmovw %eax,%k2
+___
+$code.=<<___ if (!$win64);
+ lea 8(%rsp),%r10
+.cfi_def_cfa_register %r10
+ sub \$0x128,%rsp
+___
+$code.=<<___ if ($win64);
+ lea 8(%rsp),%r10
+ sub \$0x1c8,%rsp
+ vmovdqa %xmm6,-0xb0(%r10)
+ vmovdqa %xmm7,-0xa0(%r10)
+ vmovdqa %xmm8,-0x90(%r10)
+ vmovdqa %xmm9,-0x80(%r10)
+ vmovdqa %xmm10,-0x70(%r10)
+ vmovdqa %xmm11,-0x60(%r10)
+ vmovdqa %xmm12,-0x50(%r10)
+ vmovdqa %xmm13,-0x40(%r10)
+ vmovdqa %xmm14,-0x30(%r10)
+ vmovdqa %xmm15,-0x20(%r10)
+.Ldo_avx512_body:
+___
+$code.=<<___;
+ lea .Lconst(%rip),%rcx
+ lea 48+64($ctx),$ctx # size optimization
+ vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
+
+ # expand pre-calculated table
+ vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
+ and \$-512,%rsp
+ vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
+ mov \$0x20,%rax
+ vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
+ vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
+ vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
+ vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
+ vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
+ vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
+ vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
+ vpermd $D0,$T2,$R0 # 00003412 -> 14243444
+ vpbroadcastq 64(%rcx),$MASK # .Lmask26
+ vpermd $D1,$T2,$R1
+ vpermd $T0,$T2,$S1
+ vpermd $D2,$T2,$R2
+ vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
+ vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
+ vpermd $T1,$T2,$S2
+ vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
+ vpsrlq \$32,$R1,$T1
+ vpermd $D3,$T2,$R3
+ vmovdqa64 $S1,0x40(%rsp){%k2}
+ vpermd $T3,$T2,$S3
+ vpermd $D4,$T2,$R4
+ vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
+ vpermd $T4,$T2,$S4
+ vmovdqa64 $S2,0x80(%rsp){%k2}
+ vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
+ vmovdqa64 $S3,0xc0(%rsp){%k2}
+ vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
+ vmovdqa64 $S4,0x100(%rsp){%k2}
+
+ ################################################################
+ # calculate 5th through 8th powers of the key
+ #
+ # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
+ # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
+ # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
+ # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
+ # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
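+	# (the primed values come from the other 32-bit halves of the same
+	# table entries, so each lane multiplies r^n by r^4 and this single
+	# pass yields r^5..r^8, the 05060708 set mentioned below)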
+
+ vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
+ vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
+ vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
+ vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
+ vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
+ vpsrlq \$32,$R2,$T2
+
+ vpmuludq $T1,$S4,$M0
+ vpmuludq $T1,$R0,$M1
+ vpmuludq $T1,$R1,$M2
+ vpmuludq $T1,$R2,$M3
+ vpmuludq $T1,$R3,$M4
+ vpsrlq \$32,$R3,$T3
+ vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
+ vpaddq $M1,$D1,$D1 # d1 += r1'*r0
+ vpaddq $M2,$D2,$D2 # d2 += r1'*r1
+ vpaddq $M3,$D3,$D3 # d3 += r1'*r2
+ vpaddq $M4,$D4,$D4 # d4 += r1'*r3
+
+ vpmuludq $T2,$S3,$M0
+ vpmuludq $T2,$S4,$M1
+ vpmuludq $T2,$R1,$M3
+ vpmuludq $T2,$R2,$M4
+ vpmuludq $T2,$R0,$M2
+ vpsrlq \$32,$R4,$T4
+ vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
+ vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
+ vpaddq $M3,$D3,$D3 # d3 += r2'*r1
+ vpaddq $M4,$D4,$D4 # d4 += r2'*r2
+ vpaddq $M2,$D2,$D2 # d2 += r2'*r0
+
+ vpmuludq $T3,$S2,$M0
+ vpmuludq $T3,$R0,$M3
+ vpmuludq $T3,$R1,$M4
+ vpmuludq $T3,$S3,$M1
+ vpmuludq $T3,$S4,$M2
+ vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
+ vpaddq $M3,$D3,$D3 # d3 += r3'*r0
+ vpaddq $M4,$D4,$D4 # d4 += r3'*r1
+ vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
+ vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
+
+ vpmuludq $T4,$S4,$M3
+ vpmuludq $T4,$R0,$M4
+ vpmuludq $T4,$S1,$M0
+ vpmuludq $T4,$S2,$M1
+ vpmuludq $T4,$S3,$M2
+	vpaddq		$M3,$D3,$D3		# d3 += r4'*5*r4
+	vpaddq		$M4,$D4,$D4		# d4 += r4'*r0
+	vpaddq		$M0,$D0,$D0		# d0 += r4'*5*r1
+	vpaddq		$M1,$D1,$D1		# d1 += r4'*5*r2
+	vpaddq		$M2,$D2,$D2		# d2 += r4'*5*r3
+
+ ################################################################
+ # load input
+ vmovdqu64 16*0($inp),%z#$T3
+ vmovdqu64 16*4($inp),%z#$T4
+ lea 16*8($inp),$inp
+
+ ################################################################
+ # lazy reduction
+
+ vpsrlq \$26,$D3,$M3
+ vpandq $MASK,$D3,$D3
+ vpaddq $M3,$D4,$D4 # d3 -> d4
+
+ vpsrlq \$26,$D0,$M0
+ vpandq $MASK,$D0,$D0
+ vpaddq $M0,$D1,$D1 # d0 -> d1
+
+ vpsrlq \$26,$D4,$M4
+ vpandq $MASK,$D4,$D4
+
+ vpsrlq \$26,$D1,$M1
+ vpandq $MASK,$D1,$D1
+ vpaddq $M1,$D2,$D2 # d1 -> d2
+
+ vpaddq $M4,$D0,$D0
+ vpsllq \$2,$M4,$M4
+ vpaddq $M4,$D0,$D0 # d4 -> d0
+
+ vpsrlq \$26,$D2,$M2
+ vpandq $MASK,$D2,$D2
+ vpaddq $M2,$D3,$D3 # d2 -> d3
+
+ vpsrlq \$26,$D0,$M0
+ vpandq $MASK,$D0,$D0
+ vpaddq $M0,$D1,$D1 # d0 -> d1
+
+ vpsrlq \$26,$D3,$M3
+ vpandq $MASK,$D3,$D3
+ vpaddq $M3,$D4,$D4 # d3 -> d4
+
+ ################################################################
+ # at this point we have 14243444 in $R0-$S4 and 05060708 in
+ # $D0-$D4, ...
+
+ vpunpcklqdq $T4,$T3,$T0 # transpose input
+ vpunpckhqdq $T4,$T3,$T4
+
+	# ... since the input 64-bit lanes are ordered as 73625140, we could
+	# "vperm" it to 76543210 (here and in each loop iteration), *or*
+	# we can just flow along, hence the goal for $R0-$S4 is
+	# 1858286838784888 ...
+
+	vmovdqa32	128(%rcx),$M0		# .Lpermd_avx512
+ mov \$0x7777,%eax
+ kmovw %eax,%k1
+
+ vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
+ vpermd $R1,$M0,$R1
+ vpermd $R2,$M0,$R2
+ vpermd $R3,$M0,$R3
+ vpermd $R4,$M0,$R4
+
+ vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
+ vpermd $D1,$M0,${R1}{%k1}
+ vpermd $D2,$M0,${R2}{%k1}
+ vpermd $D3,$M0,${R3}{%k1}
+ vpermd $D4,$M0,${R4}{%k1}
+
+ vpslld \$2,$R1,$S1 # *5
+ vpslld \$2,$R2,$S2
+ vpslld \$2,$R3,$S3
+ vpslld \$2,$R4,$S4
+ vpaddd $R1,$S1,$S1
+ vpaddd $R2,$S2,$S2
+ vpaddd $R3,$S3,$S3
+ vpaddd $R4,$S4,$S4
+
+ vpbroadcastq 32(%rcx),$PADBIT # .L129
+
+ vpsrlq \$52,$T0,$T2 # splat input
+ vpsllq \$12,$T4,$T3
+ vporq $T3,$T2,$T2
+ vpsrlq \$26,$T0,$T1
+ vpsrlq \$14,$T4,$T3
+ vpsrlq \$40,$T4,$T4 # 4
+ vpandq $MASK,$T2,$T2 # 2
+ vpandq $MASK,$T0,$T0 # 0
+ #vpandq $MASK,$T1,$T1 # 1
+ #vpandq $MASK,$T3,$T3 # 3
+ #vporq $PADBIT,$T4,$T4 # padbit, yes, always
+
+ vpaddq $H2,$T2,$H2 # accumulate input
+ sub \$192,$len
+ jbe .Ltail_avx512
+ jmp .Loop_avx512
+
+.align 32
+.Loop_avx512:
+ ################################################################
+ # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
+ # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
+ # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
+ # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
+ # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
+ # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
+ # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
+ # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
+ # \________/\___________/
+ ################################################################
+ #vpaddq $H2,$T2,$H2 # accumulate input
+
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ #
+	# however, as h2 is "chronologically" the first one available, pull
+	# the corresponding operations up, so it's
+ #
+ # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
+ # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
+ # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
+ # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
+ # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
+
+ vpmuludq $H2,$R1,$D3 # d3 = h2*r1
+ vpaddq $H0,$T0,$H0
+ vpmuludq $H2,$R2,$D4 # d4 = h2*r2
+ vpandq $MASK,$T1,$T1 # 1
+ vpmuludq $H2,$S3,$D0 # d0 = h2*s3
+ vpandq $MASK,$T3,$T3 # 3
+ vpmuludq $H2,$S4,$D1 # d1 = h2*s4
+ vporq $PADBIT,$T4,$T4 # padbit, yes, always
+ vpmuludq $H2,$R0,$D2 # d2 = h2*r0
+ vpaddq $H1,$T1,$H1 # accumulate input
+ vpaddq $H3,$T3,$H3
+ vpaddq $H4,$T4,$H4
+
+ vmovdqu64 16*0($inp),$T3 # load input
+ vmovdqu64 16*4($inp),$T4
+ lea 16*8($inp),$inp
+ vpmuludq $H0,$R3,$M3
+ vpmuludq $H0,$R4,$M4
+ vpmuludq $H0,$R0,$M0
+ vpmuludq $H0,$R1,$M1
+ vpaddq $M3,$D3,$D3 # d3 += h0*r3
+ vpaddq $M4,$D4,$D4 # d4 += h0*r4
+ vpaddq $M0,$D0,$D0 # d0 += h0*r0
+ vpaddq $M1,$D1,$D1 # d1 += h0*r1
+
+ vpmuludq $H1,$R2,$M3
+ vpmuludq $H1,$R3,$M4
+ vpmuludq $H1,$S4,$M0
+ vpmuludq $H0,$R2,$M2
+ vpaddq $M3,$D3,$D3 # d3 += h1*r2
+ vpaddq $M4,$D4,$D4 # d4 += h1*r3
+ vpaddq $M0,$D0,$D0 # d0 += h1*s4
+ vpaddq $M2,$D2,$D2 # d2 += h0*r2
+
+ vpunpcklqdq $T4,$T3,$T0 # transpose input
+ vpunpckhqdq $T4,$T3,$T4
+
+ vpmuludq $H3,$R0,$M3
+ vpmuludq $H3,$R1,$M4
+ vpmuludq $H1,$R0,$M1
+ vpmuludq $H1,$R1,$M2
+ vpaddq $M3,$D3,$D3 # d3 += h3*r0
+ vpaddq $M4,$D4,$D4 # d4 += h3*r1
+ vpaddq $M1,$D1,$D1 # d1 += h1*r0
+ vpaddq $M2,$D2,$D2 # d2 += h1*r1
+
+ vpmuludq $H4,$S4,$M3
+ vpmuludq $H4,$R0,$M4
+ vpmuludq $H3,$S2,$M0
+ vpmuludq $H3,$S3,$M1
+ vpaddq $M3,$D3,$D3 # d3 += h4*s4
+ vpmuludq $H3,$S4,$M2
+ vpaddq $M4,$D4,$D4 # d4 += h4*r0
+ vpaddq $M0,$D0,$D0 # d0 += h3*s2
+ vpaddq $M1,$D1,$D1 # d1 += h3*s3
+ vpaddq $M2,$D2,$D2 # d2 += h3*s4
+
+ vpmuludq $H4,$S1,$M0
+ vpmuludq $H4,$S2,$M1
+ vpmuludq $H4,$S3,$M2
+ vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
+	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
+	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
+
+ ################################################################
+ # lazy reduction (interleaved with input splat)
+
+ vpsrlq \$52,$T0,$T2 # splat input
+ vpsllq \$12,$T4,$T3
+
+ vpsrlq \$26,$D3,$H3
+ vpandq $MASK,$D3,$D3
+ vpaddq $H3,$D4,$H4 # h3 -> h4
+
+ vporq $T3,$T2,$T2
+
+ vpsrlq \$26,$H0,$D0
+ vpandq $MASK,$H0,$H0
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpandq $MASK,$T2,$T2 # 2
+
+ vpsrlq \$26,$H4,$D4
+ vpandq $MASK,$H4,$H4
+
+ vpsrlq \$26,$H1,$D1
+ vpandq $MASK,$H1,$H1
+ vpaddq $D1,$H2,$H2 # h1 -> h2
+
+ vpaddq $D4,$H0,$H0
+ vpsllq \$2,$D4,$D4
+ vpaddq $D4,$H0,$H0 # h4 -> h0
+
+ vpaddq $T2,$H2,$H2 # modulo-scheduled
+ vpsrlq \$26,$T0,$T1
+
+ vpsrlq \$26,$H2,$D2
+ vpandq $MASK,$H2,$H2
+ vpaddq $D2,$D3,$H3 # h2 -> h3
+
+ vpsrlq \$14,$T4,$T3
+
+ vpsrlq \$26,$H0,$D0
+ vpandq $MASK,$H0,$H0
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$40,$T4,$T4 # 4
+
+ vpsrlq \$26,$H3,$D3
+ vpandq $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpandq $MASK,$T0,$T0 # 0
+ #vpandq $MASK,$T1,$T1 # 1
+ #vpandq $MASK,$T3,$T3 # 3
+ #vporq $PADBIT,$T4,$T4 # padbit, yes, always
+
+ sub \$128,$len
+ ja .Loop_avx512
+
+.Ltail_avx512:
+	################################################################
+	# while the above multiplications were by r^8 in all lanes, in the
+	# last iteration we multiply the least significant lane by r^8 and
+	# the most significant one by r, which is why the table gets shifted...
+
+ vpsrlq \$32,$R0,$R0 # 0105020603070408
+ vpsrlq \$32,$R1,$R1
+ vpsrlq \$32,$R2,$R2
+ vpsrlq \$32,$S3,$S3
+ vpsrlq \$32,$S4,$S4
+ vpsrlq \$32,$R3,$R3
+ vpsrlq \$32,$R4,$R4
+ vpsrlq \$32,$S1,$S1
+ vpsrlq \$32,$S2,$S2
+
+ ################################################################
+	# load either the next or the last 64 bytes of input
+ lea ($inp,$len),$inp
+
+ #vpaddq $H2,$T2,$H2 # accumulate input
+ vpaddq $H0,$T0,$H0
+
+ vpmuludq $H2,$R1,$D3 # d3 = h2*r1
+ vpmuludq $H2,$R2,$D4 # d4 = h2*r2
+ vpmuludq $H2,$S3,$D0 # d0 = h2*s3
+ vpandq $MASK,$T1,$T1 # 1
+ vpmuludq $H2,$S4,$D1 # d1 = h2*s4
+ vpandq $MASK,$T3,$T3 # 3
+ vpmuludq $H2,$R0,$D2 # d2 = h2*r0
+ vporq $PADBIT,$T4,$T4 # padbit, yes, always
+ vpaddq $H1,$T1,$H1 # accumulate input
+ vpaddq $H3,$T3,$H3
+ vpaddq $H4,$T4,$H4
+
+ vmovdqu 16*0($inp),%x#$T0
+ vpmuludq $H0,$R3,$M3
+ vpmuludq $H0,$R4,$M4
+ vpmuludq $H0,$R0,$M0
+ vpmuludq $H0,$R1,$M1
+ vpaddq $M3,$D3,$D3 # d3 += h0*r3
+ vpaddq $M4,$D4,$D4 # d4 += h0*r4
+ vpaddq $M0,$D0,$D0 # d0 += h0*r0
+ vpaddq $M1,$D1,$D1 # d1 += h0*r1
+
+ vmovdqu 16*1($inp),%x#$T1
+ vpmuludq $H1,$R2,$M3
+ vpmuludq $H1,$R3,$M4
+ vpmuludq $H1,$S4,$M0
+ vpmuludq $H0,$R2,$M2
+ vpaddq $M3,$D3,$D3 # d3 += h1*r2
+ vpaddq $M4,$D4,$D4 # d4 += h1*r3
+ vpaddq $M0,$D0,$D0 # d0 += h1*s4
+ vpaddq $M2,$D2,$D2 # d2 += h0*r2
+
+ vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
+ vpmuludq $H3,$R0,$M3
+ vpmuludq $H3,$R1,$M4
+ vpmuludq $H1,$R0,$M1
+ vpmuludq $H1,$R1,$M2
+ vpaddq $M3,$D3,$D3 # d3 += h3*r0
+ vpaddq $M4,$D4,$D4 # d4 += h3*r1
+ vpaddq $M1,$D1,$D1 # d1 += h1*r0
+ vpaddq $M2,$D2,$D2 # d2 += h1*r1
+
+ vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
+ vpmuludq $H4,$S4,$M3
+ vpmuludq $H4,$R0,$M4
+ vpmuludq $H3,$S2,$M0
+ vpmuludq $H3,$S3,$M1
+ vpmuludq $H3,$S4,$M2
+ vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
+ vpaddq $M4,$D4,$D4 # d4 += h4*r0
+ vpaddq $M0,$D0,$D0 # d0 += h3*s2
+ vpaddq $M1,$D1,$D1 # d1 += h3*s3
+ vpaddq $M2,$D2,$D2 # d2 += h3*s4
+
+ vpmuludq $H4,$S1,$M0
+ vpmuludq $H4,$S2,$M1
+ vpmuludq $H4,$S3,$M2
+ vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
+	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
+	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
+
+ ################################################################
+ # horizontal addition
+
+ mov \$1,%eax
+ vpermq \$0xb1,$H3,$D3
+ vpermq \$0xb1,$D4,$H4
+ vpermq \$0xb1,$H0,$D0
+ vpermq \$0xb1,$H1,$D1
+ vpermq \$0xb1,$H2,$D2
+ vpaddq $D3,$H3,$H3
+ vpaddq $D4,$H4,$H4
+ vpaddq $D0,$H0,$H0
+ vpaddq $D1,$H1,$H1
+ vpaddq $D2,$H2,$H2
+
+ kmovw %eax,%k3
+ vpermq \$0x2,$H3,$D3
+ vpermq \$0x2,$H4,$D4
+ vpermq \$0x2,$H0,$D0
+ vpermq \$0x2,$H1,$D1
+ vpermq \$0x2,$H2,$D2
+ vpaddq $D3,$H3,$H3
+ vpaddq $D4,$H4,$H4
+ vpaddq $D0,$H0,$H0
+ vpaddq $D1,$H1,$H1
+ vpaddq $D2,$H2,$H2
+
+ vextracti64x4 \$0x1,$H3,%y#$D3
+ vextracti64x4 \$0x1,$H4,%y#$D4
+ vextracti64x4 \$0x1,$H0,%y#$D0
+ vextracti64x4 \$0x1,$H1,%y#$D1
+ vextracti64x4 \$0x1,$H2,%y#$D2
+ vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
+ vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
+ vpaddq $D0,$H0,${H0}{%k3}{z}
+ vpaddq $D1,$H1,${H1}{%k3}{z}
+ vpaddq $D2,$H2,${H2}{%k3}{z}
+___
+map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
+map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
+$code.=<<___;
+ ################################################################
+ # lazy reduction (interleaved with input splat)
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpsrldq \$6,$T0,$T2 # splat input
+ vpsrldq \$6,$T1,$T3
+ vpunpckhqdq $T1,$T0,$T4 # 4
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpunpcklqdq $T3,$T2,$T2 # 2:3
+ vpunpcklqdq $T1,$T0,$T0 # 0:1
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H4,$D4
+ vpand $MASK,$H4,$H4
+
+ vpsrlq \$26,$H1,$D1
+ vpand $MASK,$H1,$H1
+ vpsrlq \$30,$T2,$T3
+ vpsrlq \$4,$T2,$T2
+ vpaddq $D1,$H2,$H2 # h1 -> h2
+
+ vpaddq $D4,$H0,$H0
+ vpsllq \$2,$D4,$D4
+ vpsrlq \$26,$T0,$T1
+ vpsrlq \$40,$T4,$T4 # 4
+ vpaddq $D4,$H0,$H0 # h4 -> h0
+
+ vpsrlq \$26,$H2,$D2
+ vpand $MASK,$H2,$H2
+ vpand $MASK,$T2,$T2 # 2
+ vpand $MASK,$T0,$T0 # 0
+ vpaddq $D2,$H3,$H3 # h2 -> h3
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
+ vpand $MASK,$T1,$T1 # 1
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpand $MASK,$T3,$T3 # 3
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
+ add \$64,$len
+ jnz .Ltail_avx2$suffix
+
+ vpsubq $T2,$H2,$H2 # undo input accumulation
+ vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
+ vmovd %x#$H1,`4*1-48-64`($ctx)
+ vmovd %x#$H2,`4*2-48-64`($ctx)
+ vmovd %x#$H3,`4*3-48-64`($ctx)
+ vmovd %x#$H4,`4*4-48-64`($ctx)
+ vzeroall
+___
+$code.=<<___ if ($win64);
+ movdqa -0xb0(%r10),%xmm6
+ movdqa -0xa0(%r10),%xmm7
+ movdqa -0x90(%r10),%xmm8
+ movdqa -0x80(%r10),%xmm9
+ movdqa -0x70(%r10),%xmm10
+ movdqa -0x60(%r10),%xmm11
+ movdqa -0x50(%r10),%xmm12
+ movdqa -0x40(%r10),%xmm13
+ movdqa -0x30(%r10),%xmm14
+ movdqa -0x20(%r10),%xmm15
+ lea -8(%r10),%rsp
+.Ldo_avx512_epilogue:
+___
+$code.=<<___ if (!$win64);
+ lea -8(%r10),%rsp
+.cfi_def_cfa_register %rsp
+___
+$code.=<<___;
+ RET
+.cfi_endproc
+___
+
+}
+
+}
+
+&declare_function("poly1305_blocks_avx2", 32, 4);
+poly1305_blocks_avxN(0);
+&end_function("poly1305_blocks_avx2");
+
+#######################################################################
+if ($avx>2) {
+# On entry we have the input length divisible by 64. But since the inner
+# loop processes 128 bytes per iteration, lengths that are not divisible
+# by 128 are handled by passing the tail 64 bytes to .Ltail_avx2. For this
+# reason the stack layout is kept identical to poly1305_blocks_avx2. If
+# not for this tail, we wouldn't even have to allocate a stack frame...
+
+&declare_function("poly1305_blocks_avx512", 32, 4);
+poly1305_blocks_avxN(1);
+&end_function("poly1305_blocks_avx512");
+
+if (!$kernel && $avx>3) {
+########################################################################
+# VPMADD52 version using 2^44 radix.
+#
+# One can argue that base 2^52 would be more natural. Well, even though
+# some operations would be more natural, one has to recognize a couple of
+# things. Base 2^52 doesn't provide an advantage over base 2^44 if you
+# look at the amount of multiply-and-accumulate operations. Secondly, it
+# makes it impossible to pre-compute multiples of 5 [referred to as
+# s[]/sN in reference implementations], which means that more such
+# operations would have to be performed in the inner loop, which in turn
+# makes the critical path longer. In other words, even though base 2^44
+# reduction might look less elegant, the overall critical path is
+# actually shorter...
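+#
+# (The *20 factor: in base 2^44 the limbs sit at 2^0, 2^44 and 2^88, so
+# a product term spilling past the top limb lands at 2^132 = 4*2^130,
+# which is congruent to 4*5 = 20 modulo 2^130-5.)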
+
+########################################################################
+# The layout of the opaque area is as follows.
+#
+# unsigned __int64 h[3]; # current hash value base 2^44
+# unsigned __int64 s[2]; # key value*20 base 2^44
+# unsigned __int64 r[3]; # key value base 2^44
+# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
+# # r^n positions reflect
+# # placement in register, not
+# # memory, R[3] is R[1]*20
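+#
+# (The qword at 64($ctx) doubles as a sentinel: init stores -1 there and
+# the blocks routines test its sign to decide whether the R[] powers
+# still have to be computed, see .Linit_vpmadd52.)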
+
+$code.=<<___;
+.type poly1305_init_base2_44,\@function,3
+.align 32
+poly1305_init_base2_44:
+ xor %eax,%eax
+ mov %rax,0($ctx) # initialize hash value
+ mov %rax,8($ctx)
+ mov %rax,16($ctx)
+
+.Linit_base2_44:
+ lea poly1305_blocks_vpmadd52(%rip),%r10
+ lea poly1305_emit_base2_44(%rip),%r11
+
+ mov \$0x0ffffffc0fffffff,%rax
+ mov \$0x0ffffffc0ffffffc,%rcx
+ and 0($inp),%rax
+ mov \$0x00000fffffffffff,%r8
+ and 8($inp),%rcx
+ mov \$0x00000fffffffffff,%r9
+ and %rax,%r8
+ shrd \$44,%rcx,%rax
+ mov %r8,40($ctx) # r0
+ and %r9,%rax
+ shr \$24,%rcx
+ mov %rax,48($ctx) # r1
+ lea (%rax,%rax,4),%rax # *5
+ mov %rcx,56($ctx) # r2
+ shl \$2,%rax # magic <<2
+ lea (%rcx,%rcx,4),%rcx # *5
+ shl \$2,%rcx # magic <<2
+ mov %rax,24($ctx) # s1
+ mov %rcx,32($ctx) # s2
+ movq \$-1,64($ctx) # write impossible value
+___
+$code.=<<___ if ($flavour !~ /elf32/);
+ mov %r10,0(%rdx)
+ mov %r11,8(%rdx)
+___
+$code.=<<___ if ($flavour =~ /elf32/);
+ mov %r10d,0(%rdx)
+ mov %r11d,4(%rdx)
+___
+$code.=<<___;
+ mov \$1,%eax
+ RET
+.size poly1305_init_base2_44,.-poly1305_init_base2_44
+___
+{
+my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
+my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
+my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
+
+$code.=<<___;
+.type poly1305_blocks_vpmadd52,\@function,4
+.align 32
+poly1305_blocks_vpmadd52:
+ shr \$4,$len
+ jz .Lno_data_vpmadd52 # too short
+
+ shl \$40,$padbit
+ mov 64($ctx),%r8 # peek on power of the key
+
+	# if the powers of the key are not calculated yet, process up to 3
+	# blocks with this single-block subroutine, otherwise ensure that
+	# the length is divisible by 2 blocks and pass the rest down to the
+	# next subroutine...
+
+ mov \$3,%rax
+ mov \$1,%r10
+ cmp \$4,$len # is input long
+ cmovae %r10,%rax
+ test %r8,%r8 # is power value impossible?
+ cmovns %r10,%rax
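+	# (%rax is 3 only when the powers are missing and the input is
+	# shorter than 4 blocks; the and below then yields the number of
+	# leading blocks for the single-block loop: len&1 in the steady
+	# state, or the entire short input)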
+
+ and $len,%rax # is input of favourable length?
+ jz .Lblocks_vpmadd52_4x
+
+ sub %rax,$len
+ mov \$7,%r10d
+ mov \$1,%r11d
+ kmovw %r10d,%k7
+ lea .L2_44_inp_permd(%rip),%r10
+ kmovw %r11d,%k1
+
+ vmovq $padbit,%x#$PAD
+ vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
+ vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
+ vpermq \$0xcf,$PAD,$PAD
+ vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
+
+ vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
+ vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
+ vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
+ vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
+
+ vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
+ vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
+
+ jmp .Loop_vpmadd52
+
+.align 32
+.Loop_vpmadd52:
+ vmovdqu32 0($inp),%x#$T0 # load input as ----3210
+ lea 16($inp),$inp
+
+ vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
+ vpsrlvq $inp_shift,$T0,$T0
+ vpandq $reduc_mask,$T0,$T0
+ vporq $PAD,$T0,$T0
+
+ vpaddq $T0,$Dlo,$Dlo # accumulate input
+
+ vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
+ vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
+ vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
+
+ vpxord $Dlo,$Dlo,$Dlo
+ vpxord $Dhi,$Dhi,$Dhi
+
+ vpmadd52luq $r2r1r0,$H0,$Dlo
+ vpmadd52huq $r2r1r0,$H0,$Dhi
+
+ vpmadd52luq $r1r0s2,$H1,$Dlo
+ vpmadd52huq $r1r0s2,$H1,$Dhi
+
+ vpmadd52luq $r0s2s1,$H2,$Dlo
+ vpmadd52huq $r0s2s1,$H2,$Dhi
+
+ vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
+ vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
+ vpandq $reduc_mask,$Dlo,$Dlo
+
+ vpaddq $T0,$Dhi,$Dhi
+
+ vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
+
+ vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
+
+ vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word
+ vpandq $reduc_mask,$Dlo,$Dlo
+
+ vpermq \$0b10010011,$T0,$T0
+
+ vpaddq $T0,$Dlo,$Dlo
+
+ vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
+
+ vpaddq $T0,$Dlo,$Dlo
+ vpsllq \$2,$T0,$T0
+
+ vpaddq $T0,$Dlo,$Dlo
+
+ dec %rax # len-=16
+ jnz .Loop_vpmadd52
+
+ vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
+
+ test $len,$len
+ jnz .Lblocks_vpmadd52_4x
+
+.Lno_data_vpmadd52:
+ RET
+.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
+___
+}
+{
+########################################################################
+# As implied by its name, the 4x subroutine processes 4 blocks in parallel
+# (but also handles lengths of 4*n+2 blocks). It takes up to the 4th key
+# power and operates on 256-bit %ymm registers.
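+#
+# (Lengths of 4*n+2 blocks are folded in via the 2x path below, which
+# uses a 0b0101 %k1 mask to keep just two key powers per register.)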
+
+my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
+my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
+my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
+
+$code.=<<___;
+.type poly1305_blocks_vpmadd52_4x,\@function,4
+.align 32
+poly1305_blocks_vpmadd52_4x:
+ shr \$4,$len
+ jz .Lno_data_vpmadd52_4x # too short
+
+ shl \$40,$padbit
+ mov 64($ctx),%r8 # peek on power of the key
+
+.Lblocks_vpmadd52_4x:
+ vpbroadcastq $padbit,$PAD
+
+ vmovdqa64 .Lx_mask44(%rip),$mask44
+ mov \$5,%eax
+ vmovdqa64 .Lx_mask42(%rip),$mask42
+ kmovw %eax,%k1 # used in 2x path
+
+ test %r8,%r8 # is power value impossible?
+ js .Linit_vpmadd52 # if it is, then init R[4]
+
+ vmovq 0($ctx),%x#$H0 # load current hash value
+ vmovq 8($ctx),%x#$H1
+ vmovq 16($ctx),%x#$H2
+
+ test \$3,$len # is length 4*n+2?
+ jnz .Lblocks_vpmadd52_2x_do
+
+.Lblocks_vpmadd52_4x_do:
+ vpbroadcastq 64($ctx),$R0 # load 4th power of the key
+ vpbroadcastq 96($ctx),$R1
+ vpbroadcastq 128($ctx),$R2
+ vpbroadcastq 160($ctx),$S1
+
+.Lblocks_vpmadd52_4x_key_loaded:
+ vpsllq \$2,$R2,$S2 # S2 = R2*5*4
+ vpaddq $R2,$S2,$S2
+ vpsllq \$2,$S2,$S2
+
+ test \$7,$len # is len 8*n?
+ jz .Lblocks_vpmadd52_8x
+
+ vmovdqu64 16*0($inp),$T2 # load data
+ vmovdqu64 16*2($inp),$T3
+ lea 16*4($inp),$inp
+
+ vpunpcklqdq $T3,$T2,$T1 # transpose data
+ vpunpckhqdq $T3,$T2,$T3
+
+ # at this point 64-bit lanes are ordered as 3-1-2-0
+
+ vpsrlq \$24,$T3,$T2 # splat the data
+ vporq $PAD,$T2,$T2
+ vpaddq $T2,$H2,$H2 # accumulate input
+ vpandq $mask44,$T1,$T0
+ vpsrlq \$44,$T1,$T1
+ vpsllq \$20,$T3,$T3
+ vporq $T3,$T1,$T1
+ vpandq $mask44,$T1,$T1
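+	# (each 128-bit block is split 44+44+40 bits; $padbit, shifted left
+	# by 40 on entry, tops up the third limb)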
+
+ sub \$4,$len
+ jz .Ltail_vpmadd52_4x
+ jmp .Loop_vpmadd52_4x
+ ud2
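+	# (the ud2 traps if control ever falls through past the jmp)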
+
+.align 32
+.Linit_vpmadd52:
+ vmovq 24($ctx),%x#$S1 # load key
+ vmovq 56($ctx),%x#$H2
+ vmovq 32($ctx),%x#$S2
+ vmovq 40($ctx),%x#$R0
+ vmovq 48($ctx),%x#$R1
+
+ vmovdqa $R0,$H0
+ vmovdqa $R1,$H1
+ vmovdqa $H2,$R2
+
+ mov \$2,%eax
+
+.Lmul_init_vpmadd52:
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $H2,$S1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $H2,$S1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $H2,$S2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $H2,$S2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $H2,$R0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $H2,$R0,$D2hi
+
+ vpmadd52luq $H0,$R0,$D0lo
+ vpmadd52huq $H0,$R0,$D0hi
+ vpmadd52luq $H0,$R1,$D1lo
+ vpmadd52huq $H0,$R1,$D1hi
+ vpmadd52luq $H0,$R2,$D2lo
+ vpmadd52huq $H0,$R2,$D2hi
+
+ vpmadd52luq $H1,$S2,$D0lo
+ vpmadd52huq $H1,$S2,$D0hi
+ vpmadd52luq $H1,$R0,$D1lo
+ vpmadd52huq $H1,$R0,$D1hi
+ vpmadd52luq $H1,$R1,$D2lo
+ vpmadd52huq $H1,$R1,$D2hi
+
+ ################################################################
+ # partial reduction
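+	# (the "hi" halves produced by vpmadd52huq sit 52 bits up, so the
+	# <<8 and <<10 shifts align them with the next 44- or 42-bit limb;
+	# the carry out of the 42-bit top limb wraps to limb 0 times 5,
+	# i.e. one add plus one shift-by-2 add)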
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$H0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$H1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$H2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+
+ vpsrlq \$44,$H0,$tmp # additional step
+ vpandq $mask44,$H0,$H0
+
+ vpaddq $tmp,$H1,$H1
+
+ dec %eax
+ jz .Ldone_init_vpmadd52
+
+ vpunpcklqdq $R1,$H1,$R1 # 1,2
+ vpbroadcastq %x#$H1,%x#$H1 # 2,2
+ vpunpcklqdq $R2,$H2,$R2
+ vpbroadcastq %x#$H2,%x#$H2
+ vpunpcklqdq $R0,$H0,$R0
+ vpbroadcastq %x#$H0,%x#$H0
+
+ vpsllq \$2,$R1,$S1 # S1 = R1*5*4
+ vpsllq \$2,$R2,$S2 # S2 = R2*5*4
+ vpaddq $R1,$S1,$S1
+ vpaddq $R2,$S2,$S2
+ vpsllq \$2,$S1,$S1
+ vpsllq \$2,$S2,$S2
+
+ jmp .Lmul_init_vpmadd52
+ ud2
+
+.align 32
+.Ldone_init_vpmadd52:
+ vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
+ vinserti128 \$1,%x#$R2,$H2,$R2
+ vinserti128 \$1,%x#$R0,$H0,$R0
+
+ vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
+ vpermq \$0b11011000,$R2,$R2
+ vpermq \$0b11011000,$R0,$R0
+
+ vpsllq \$2,$R1,$S1 # S1 = R1*5*4
+ vpaddq $R1,$S1,$S1
+ vpsllq \$2,$S1,$S1
+
+ vmovq 0($ctx),%x#$H0 # load current hash value
+ vmovq 8($ctx),%x#$H1
+ vmovq 16($ctx),%x#$H2
+
+ test \$3,$len # is length 4*n+2?
+ jnz .Ldone_init_vpmadd52_2x
+
+ vmovdqu64 $R0,64($ctx) # save key powers
+ vpbroadcastq %x#$R0,$R0 # broadcast 4th power
+ vmovdqu64 $R1,96($ctx)
+ vpbroadcastq %x#$R1,$R1
+ vmovdqu64 $R2,128($ctx)
+ vpbroadcastq %x#$R2,$R2
+ vmovdqu64 $S1,160($ctx)
+ vpbroadcastq %x#$S1,$S1
+
+ jmp .Lblocks_vpmadd52_4x_key_loaded
+ ud2
+
+.align 32
+.Ldone_init_vpmadd52_2x:
+ vmovdqu64 $R0,64($ctx) # save key powers
+ vpsrldq \$8,$R0,$R0 # 0-1-0-2
+ vmovdqu64 $R1,96($ctx)
+ vpsrldq \$8,$R1,$R1
+ vmovdqu64 $R2,128($ctx)
+ vpsrldq \$8,$R2,$R2
+ vmovdqu64 $S1,160($ctx)
+ vpsrldq \$8,$S1,$S1
+ jmp .Lblocks_vpmadd52_2x_key_loaded
+ ud2
+
+.align 32
+.Lblocks_vpmadd52_2x_do:
+ vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
+ vmovdqu64 160+8($ctx),${S1}{%k1}{z}
+ vmovdqu64 64+8($ctx),${R0}{%k1}{z}
+ vmovdqu64 96+8($ctx),${R1}{%k1}{z}
+
+.Lblocks_vpmadd52_2x_key_loaded:
+ vmovdqu64 16*0($inp),$T2 # load data
+ vpxorq $T3,$T3,$T3
+ lea 16*2($inp),$inp
+
+ vpunpcklqdq $T3,$T2,$T1 # transpose data
+ vpunpckhqdq $T3,$T2,$T3
+
+ # at this point 64-bit lanes are ordered as x-1-x-0
+
+ vpsrlq \$24,$T3,$T2 # splat the data
+ vporq $PAD,$T2,$T2
+ vpaddq $T2,$H2,$H2 # accumulate input
+ vpandq $mask44,$T1,$T0
+ vpsrlq \$44,$T1,$T1
+ vpsllq \$20,$T3,$T3
+ vporq $T3,$T1,$T1
+ vpandq $mask44,$T1,$T1
+
+ jmp .Ltail_vpmadd52_2x
+ ud2
+
+.align 32
+.Loop_vpmadd52_4x:
+ #vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$H1,$H1
+
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $H2,$S1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $H2,$S1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $H2,$S2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $H2,$S2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $H2,$R0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $H2,$R0,$D2hi
+
+ vmovdqu64 16*0($inp),$T2 # load data
+ vmovdqu64 16*2($inp),$T3
+ lea 16*4($inp),$inp
+ vpmadd52luq $H0,$R0,$D0lo
+ vpmadd52huq $H0,$R0,$D0hi
+ vpmadd52luq $H0,$R1,$D1lo
+ vpmadd52huq $H0,$R1,$D1hi
+ vpmadd52luq $H0,$R2,$D2lo
+ vpmadd52huq $H0,$R2,$D2hi
+
+ vpunpcklqdq $T3,$T2,$T1 # transpose data
+ vpunpckhqdq $T3,$T2,$T3
+ vpmadd52luq $H1,$S2,$D0lo
+ vpmadd52huq $H1,$S2,$D0hi
+ vpmadd52luq $H1,$R0,$D1lo
+ vpmadd52huq $H1,$R0,$D1hi
+ vpmadd52luq $H1,$R1,$D2lo
+ vpmadd52huq $H1,$R1,$D2hi
+
+ ################################################################
+ # partial reduction (interleaved with data splat)
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$H0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpsrlq \$24,$T3,$T2
+ vporq $PAD,$T2,$T2
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$H1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpandq $mask44,$T1,$T0
+ vpsrlq \$44,$T1,$T1
+ vpsllq \$20,$T3,$T3
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$H2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $D2hi,$H0,$H0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+ vporq $T3,$T1,$T1
+ vpandq $mask44,$T1,$T1
+
+ vpsrlq \$44,$H0,$tmp # additional step
+ vpandq $mask44,$H0,$H0
+
+ vpaddq $tmp,$H1,$H1
+
+ sub \$4,$len # len-=64
+ jnz .Loop_vpmadd52_4x
+
+.Ltail_vpmadd52_4x:
+ vmovdqu64 128($ctx),$R2 # load all key powers
+ vmovdqu64 160($ctx),$S1
+ vmovdqu64 64($ctx),$R0
+ vmovdqu64 96($ctx),$R1
+
+.Ltail_vpmadd52_2x:
+ vpsllq \$2,$R2,$S2 # S2 = R2*5*4
+ vpaddq $R2,$S2,$S2
+ vpsllq \$2,$S2,$S2
+
+ #vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$H1,$H1
+
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $H2,$S1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $H2,$S1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $H2,$S2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $H2,$S2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $H2,$R0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $H2,$R0,$D2hi
+
+ vpmadd52luq $H0,$R0,$D0lo
+ vpmadd52huq $H0,$R0,$D0hi
+ vpmadd52luq $H0,$R1,$D1lo
+ vpmadd52huq $H0,$R1,$D1hi
+ vpmadd52luq $H0,$R2,$D2lo
+ vpmadd52huq $H0,$R2,$D2hi
+
+ vpmadd52luq $H1,$S2,$D0lo
+ vpmadd52huq $H1,$S2,$D0hi
+ vpmadd52luq $H1,$R0,$D1lo
+ vpmadd52huq $H1,$R0,$D1hi
+ vpmadd52luq $H1,$R1,$D2lo
+ vpmadd52huq $H1,$R1,$D2hi
+
+ ################################################################
+ # horizontal addition
+
+ mov \$1,%eax
+ kmovw %eax,%k1
+ vpsrldq \$8,$D0lo,$T0
+ vpsrldq \$8,$D0hi,$H0
+ vpsrldq \$8,$D1lo,$T1
+ vpsrldq \$8,$D1hi,$H1
+ vpaddq $T0,$D0lo,$D0lo
+ vpaddq $H0,$D0hi,$D0hi
+ vpsrldq \$8,$D2lo,$T2
+ vpsrldq \$8,$D2hi,$H2
+ vpaddq $T1,$D1lo,$D1lo
+ vpaddq $H1,$D1hi,$D1hi
+ vpermq \$0x2,$D0lo,$T0
+ vpermq \$0x2,$D0hi,$H0
+ vpaddq $T2,$D2lo,$D2lo
+ vpaddq $H2,$D2hi,$D2hi
+
+ vpermq \$0x2,$D1lo,$T1
+ vpermq \$0x2,$D1hi,$H1
+ vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
+ vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
+ vpermq \$0x2,$D2lo,$T2
+ vpermq \$0x2,$D2hi,$H2
+ vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
+ vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
+ vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
+ vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
+
+ ################################################################
+ # partial reduction
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$H0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$H1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$H2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+
+ vpsrlq \$44,$H0,$tmp # additional step
+ vpandq $mask44,$H0,$H0
+
+ vpaddq $tmp,$H1,$H1
+ # at this point $len is
+ # either 4*n+2 or 0...
+ sub \$2,$len # len-=32
+ ja .Lblocks_vpmadd52_4x_do
+
+ vmovq %x#$H0,0($ctx)
+ vmovq %x#$H1,8($ctx)
+ vmovq %x#$H2,16($ctx)
+ vzeroall
+
+.Lno_data_vpmadd52_4x:
+ RET
+.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
+___
+}
+{
+########################################################################
+# As implied by its name, the 8x subroutine processes 8 blocks in parallel...
+# This is an intermediate version: it is used only in cases when the input
+# length is either 8*n, 8*n+1 or 8*n+2...
+
+my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
+my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
+my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
+my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
+
+$code.=<<___;
+.type poly1305_blocks_vpmadd52_8x,\@function,4
+.align 32
+poly1305_blocks_vpmadd52_8x:
+ shr \$4,$len
+ jz .Lno_data_vpmadd52_8x # too short
+
+ shl \$40,$padbit
+ mov 64($ctx),%r8 # peek on power of the key
+
+ vmovdqa64 .Lx_mask44(%rip),$mask44
+ vmovdqa64 .Lx_mask42(%rip),$mask42
+
+ test %r8,%r8 # is power value impossible?
+ js .Linit_vpmadd52 # if it is, then init R[4]
+
+ vmovq 0($ctx),%x#$H0 # load current hash value
+ vmovq 8($ctx),%x#$H1
+ vmovq 16($ctx),%x#$H2
+
+.Lblocks_vpmadd52_8x:
+ ################################################################
+	# first we calculate more key powers
+
+ vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
+ vmovdqu64 160($ctx),$S1
+ vmovdqu64 64($ctx),$R0
+ vmovdqu64 96($ctx),$R1
+
+ vpsllq \$2,$R2,$S2 # S2 = R2*5*4
+ vpaddq $R2,$S2,$S2
+ vpsllq \$2,$S2,$S2
+
+ vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
+ vpbroadcastq %x#$R0,$RR0
+ vpbroadcastq %x#$R1,$RR1
+
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $RR2,$S1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $RR2,$S1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $RR2,$S2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $RR2,$S2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $RR2,$R0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $RR2,$R0,$D2hi
+
+ vpmadd52luq $RR0,$R0,$D0lo
+ vpmadd52huq $RR0,$R0,$D0hi
+ vpmadd52luq $RR0,$R1,$D1lo
+ vpmadd52huq $RR0,$R1,$D1hi
+ vpmadd52luq $RR0,$R2,$D2lo
+ vpmadd52huq $RR0,$R2,$D2hi
+
+ vpmadd52luq $RR1,$S2,$D0lo
+ vpmadd52huq $RR1,$S2,$D0hi
+ vpmadd52luq $RR1,$R0,$D1lo
+ vpmadd52huq $RR1,$R0,$D1hi
+ vpmadd52luq $RR1,$R1,$D2lo
+ vpmadd52huq $RR1,$R1,$D2hi
+
+ ################################################################
+ # partial reduction
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$RR0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$RR1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$RR2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $D2hi,$RR0,$RR0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$RR0,$RR0
+
+ vpsrlq \$44,$RR0,$tmp # additional step
+ vpandq $mask44,$RR0,$RR0
+
+ vpaddq $tmp,$RR1,$RR1
+
+ ################################################################
+	# At this point Rx holds the 1324 powers and RRx the 5768 ones;
+	# the goal is 15263748, which reflects how the data is loaded...
+
+ vpunpcklqdq $R2,$RR2,$T2 # 3748
+ vpunpckhqdq $R2,$RR2,$R2 # 1526
+ vpunpcklqdq $R0,$RR0,$T0
+ vpunpckhqdq $R0,$RR0,$R0
+ vpunpcklqdq $R1,$RR1,$T1
+ vpunpckhqdq $R1,$RR1,$R1
+___
+######## switch to %zmm
+map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
+map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
+map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
+map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
+
+$code.=<<___;
+ vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
+ vshufi64x2 \$0x44,$R0,$T0,$RR0
+ vshufi64x2 \$0x44,$R1,$T1,$RR1
+
+ vmovdqu64 16*0($inp),$T2 # load data
+ vmovdqu64 16*4($inp),$T3
+ lea 16*8($inp),$inp
+
+ vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
+ vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
+ vpaddq $RR2,$SS2,$SS2
+ vpaddq $RR1,$SS1,$SS1
+ vpsllq \$2,$SS2,$SS2
+ vpsllq \$2,$SS1,$SS1
+
+ vpbroadcastq $padbit,$PAD
+ vpbroadcastq %x#$mask44,$mask44
+ vpbroadcastq %x#$mask42,$mask42
+
+ vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
+ vpbroadcastq %x#$SS2,$S2
+ vpbroadcastq %x#$RR0,$R0
+ vpbroadcastq %x#$RR1,$R1
+ vpbroadcastq %x#$RR2,$R2
+
+ vpunpcklqdq $T3,$T2,$T1 # transpose data
+ vpunpckhqdq $T3,$T2,$T3
+
+ # at this point 64-bit lanes are ordered as 73625140
+
+ vpsrlq \$24,$T3,$T2 # splat the data
+ vporq $PAD,$T2,$T2
+ vpaddq $T2,$H2,$H2 # accumulate input
+ vpandq $mask44,$T1,$T0
+ vpsrlq \$44,$T1,$T1
+ vpsllq \$20,$T3,$T3
+ vporq $T3,$T1,$T1
+ vpandq $mask44,$T1,$T1
+
+ sub \$8,$len
+ jz .Ltail_vpmadd52_8x
+ jmp .Loop_vpmadd52_8x
+
+.align 32
+.Loop_vpmadd52_8x:
+ #vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$H1,$H1
+
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $H2,$S1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $H2,$S1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $H2,$S2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $H2,$S2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $H2,$R0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $H2,$R0,$D2hi
+
+ vmovdqu64 16*0($inp),$T2 # load data
+ vmovdqu64 16*4($inp),$T3
+ lea 16*8($inp),$inp
+ vpmadd52luq $H0,$R0,$D0lo
+ vpmadd52huq $H0,$R0,$D0hi
+ vpmadd52luq $H0,$R1,$D1lo
+ vpmadd52huq $H0,$R1,$D1hi
+ vpmadd52luq $H0,$R2,$D2lo
+ vpmadd52huq $H0,$R2,$D2hi
+
+ vpunpcklqdq $T3,$T2,$T1 # transpose data
+ vpunpckhqdq $T3,$T2,$T3
+ vpmadd52luq $H1,$S2,$D0lo
+ vpmadd52huq $H1,$S2,$D0hi
+ vpmadd52luq $H1,$R0,$D1lo
+ vpmadd52huq $H1,$R0,$D1hi
+ vpmadd52luq $H1,$R1,$D2lo
+ vpmadd52huq $H1,$R1,$D2hi
+
+ ################################################################
+ # partial reduction (interleaved with data splat)
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$H0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpsrlq \$24,$T3,$T2
+ vporq $PAD,$T2,$T2
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$H1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpandq $mask44,$T1,$T0
+ vpsrlq \$44,$T1,$T1
+ vpsllq \$20,$T3,$T3
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$H2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $D2hi,$H0,$H0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+ vporq $T3,$T1,$T1
+ vpandq $mask44,$T1,$T1
+
+ vpsrlq \$44,$H0,$tmp # additional step
+ vpandq $mask44,$H0,$H0
+
+ vpaddq $tmp,$H1,$H1
+
+ sub \$8,$len # len-=128
+ jnz .Loop_vpmadd52_8x
+
+.Ltail_vpmadd52_8x:
+ #vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$H1,$H1
+
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $H2,$SS1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $H2,$SS1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $H2,$SS2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $H2,$SS2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $H2,$RR0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $H2,$RR0,$D2hi
+
+ vpmadd52luq $H0,$RR0,$D0lo
+ vpmadd52huq $H0,$RR0,$D0hi
+ vpmadd52luq $H0,$RR1,$D1lo
+ vpmadd52huq $H0,$RR1,$D1hi
+ vpmadd52luq $H0,$RR2,$D2lo
+ vpmadd52huq $H0,$RR2,$D2hi
+
+ vpmadd52luq $H1,$SS2,$D0lo
+ vpmadd52huq $H1,$SS2,$D0hi
+ vpmadd52luq $H1,$RR0,$D1lo
+ vpmadd52huq $H1,$RR0,$D1hi
+ vpmadd52luq $H1,$RR1,$D2lo
+ vpmadd52huq $H1,$RR1,$D2hi
+
+ ################################################################
+ # horizontal addition
+
+ mov \$1,%eax
+ kmovw %eax,%k1
+ vpsrldq \$8,$D0lo,$T0
+ vpsrldq \$8,$D0hi,$H0
+ vpsrldq \$8,$D1lo,$T1
+ vpsrldq \$8,$D1hi,$H1
+ vpaddq $T0,$D0lo,$D0lo
+ vpaddq $H0,$D0hi,$D0hi
+ vpsrldq \$8,$D2lo,$T2
+ vpsrldq \$8,$D2hi,$H2
+ vpaddq $T1,$D1lo,$D1lo
+ vpaddq $H1,$D1hi,$D1hi
+ vpermq \$0x2,$D0lo,$T0
+ vpermq \$0x2,$D0hi,$H0
+ vpaddq $T2,$D2lo,$D2lo
+ vpaddq $H2,$D2hi,$D2hi
+
+ vpermq \$0x2,$D1lo,$T1
+ vpermq \$0x2,$D1hi,$H1
+ vpaddq $T0,$D0lo,$D0lo
+ vpaddq $H0,$D0hi,$D0hi
+ vpermq \$0x2,$D2lo,$T2
+ vpermq \$0x2,$D2hi,$H2
+ vpaddq $T1,$D1lo,$D1lo
+ vpaddq $H1,$D1hi,$D1hi
+ vextracti64x4 \$1,$D0lo,%y#$T0
+ vextracti64x4 \$1,$D0hi,%y#$H0
+ vpaddq $T2,$D2lo,$D2lo
+ vpaddq $H2,$D2hi,$D2hi
+
+ vextracti64x4 \$1,$D1lo,%y#$T1
+ vextracti64x4 \$1,$D1hi,%y#$H1
+ vextracti64x4 \$1,$D2lo,%y#$T2
+ vextracti64x4 \$1,$D2hi,%y#$H2
+___
+######## switch back to %ymm
+map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
+map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
+map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
+
+$code.=<<___;
+ vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
+ vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
+ vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
+ vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
+ vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
+ vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
+
+ ################################################################
+ # partial reduction
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$H0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$H1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$H2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+
+ vpsrlq \$44,$H0,$tmp # additional step
+ vpandq $mask44,$H0,$H0
+
+ vpaddq $tmp,$H1,$H1
+
+ ################################################################
+
+ vmovq %x#$H0,0($ctx)
+ vmovq %x#$H1,8($ctx)
+ vmovq %x#$H2,16($ctx)
+ vzeroall
+
+.Lno_data_vpmadd52_8x:
+ RET
+.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
+___
+}
+$code.=<<___;
+.type poly1305_emit_base2_44,\@function,3
+.align 32
+poly1305_emit_base2_44:
+ mov 0($ctx),%r8 # load hash value
+ mov 8($ctx),%r9
+ mov 16($ctx),%r10
+
+ mov %r9,%rax
+ shr \$20,%r9
+ shl \$44,%rax
+ mov %r10,%rcx
+ shr \$40,%r10
+ shl \$24,%rcx
+
+ add %rax,%r8
+ adc %rcx,%r9
+ adc \$0,%r10
+
+ mov %r8,%rax
+ add \$5,%r8 # compare to modulus
+ mov %r9,%rcx
+ adc \$0,%r9
+ adc \$0,%r10
+ shr \$2,%r10 # did 130-bit value overflow?
+ cmovnz %r8,%rax
+ cmovnz %r9,%rcx
+
+ add 0($nonce),%rax # accumulate nonce
+ adc 8($nonce),%rcx
+ mov %rax,0($mac) # write result
+ mov %rcx,8($mac)
+
+ RET
+.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
+___
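+# In rough C terms the emit path above does the following (an illustrative
+# sketch only -- the limb and helper names, including the u128 type, are
+# hypothetical, and the real code works on raw 64-bit words with add/adc):
+#
+#	/* h0/h1 are 44-bit limbs, h2 is a 42-bit limb */
+#	u128 h = ((u128)h2 << 88) + ((u128)h1 << 44) + h0;
+#	u128 t = h + 5;
+#	if (t >> 130)			/* h >= 2^130 - 5, i.e. h >= p */
+#		h = t;			/* h - p, taken modulo 2^128 below */
+#	h += ((u128)nonce64[1] << 64) + nonce64[0];	/* nonce, mod 2^128 */
+#	memcpy(mac, &h, 16);		/* the low 128 bits are the tag */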
+} } }
+}
+
+if (!$kernel)
+{ # chacha20-poly1305 helpers
+my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+$code.=<<___;
+.globl xor128_encrypt_n_pad
+.type xor128_encrypt_n_pad,\@abi-omnipotent
+.align 16
+xor128_encrypt_n_pad:
+ sub $otp,$inp
+ sub $otp,$out
+ mov $len,%r10 # put len aside
+ shr \$4,$len # len / 16
+ jz .Ltail_enc
+ nop
+.Loop_enc_xmm:
+ movdqu ($inp,$otp),%xmm0
+ pxor ($otp),%xmm0
+ movdqu %xmm0,($out,$otp)
+ movdqa %xmm0,($otp)
+ lea 16($otp),$otp
+ dec $len
+ jnz .Loop_enc_xmm
+
+ and \$15,%r10 # len % 16
+ jz .Ldone_enc
+
+.Ltail_enc:
+ mov \$16,$len
+ sub %r10,$len
+ xor %eax,%eax
+.Loop_enc_byte:
+ mov ($inp,$otp),%al
+ xor ($otp),%al
+ mov %al,($out,$otp)
+ mov %al,($otp)
+ lea 1($otp),$otp
+ dec %r10
+ jnz .Loop_enc_byte
+
+ xor %eax,%eax
+.Loop_enc_pad:
+ mov %al,($otp)
+ lea 1($otp),$otp
+ dec $len
+ jnz .Loop_enc_pad
+
+.Ldone_enc:
+ mov $otp,%rax
+ RET
+.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
+
+.globl xor128_decrypt_n_pad
+.type xor128_decrypt_n_pad,\@abi-omnipotent
+.align 16
+xor128_decrypt_n_pad:
+ sub $otp,$inp
+ sub $otp,$out
+ mov $len,%r10 # put len aside
+ shr \$4,$len # len / 16
+ jz .Ltail_dec
+ nop
+.Loop_dec_xmm:
+ movdqu ($inp,$otp),%xmm0
+ movdqa ($otp),%xmm1
+ pxor %xmm0,%xmm1
+ movdqu %xmm1,($out,$otp)
+ movdqa %xmm0,($otp)
+ lea 16($otp),$otp
+ dec $len
+ jnz .Loop_dec_xmm
+
+ pxor %xmm1,%xmm1
+ and \$15,%r10 # len % 16
+ jz .Ldone_dec
+
+.Ltail_dec:
+ mov \$16,$len
+ sub %r10,$len
+ xor %eax,%eax
+ xor %r11d,%r11d
+.Loop_dec_byte:
+ mov ($inp,$otp),%r11b
+ mov ($otp),%al
+ xor %r11b,%al
+ mov %al,($out,$otp)
+ mov %r11b,($otp)
+ lea 1($otp),$otp
+ dec %r10
+ jnz .Loop_dec_byte
+
+ xor %eax,%eax
+.Loop_dec_pad:
+ mov %al,($otp)
+ lea 1($otp),$otp
+ dec $len
+ jnz .Loop_dec_pad
+
+.Ldone_dec:
+ mov $otp,%rax
+ RET
+.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
+___
+}
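+# In C terms, the two helpers above behave roughly as follows (an
+# illustrative sketch; `otp' holds the ChaCha keystream on entry and is
+# left holding the 16-byte-padded ciphertext for the Poly1305 MAC):
+#
+#	u8 *xor128_encrypt_n_pad(u8 *out, const u8 *inp, u8 *otp, size_t len)
+#	{
+#		for (size_t i = 0; i < len; i++)
+#			otp[i] = out[i] = inp[i] ^ otp[i];
+#		while (len % 16)
+#			otp[len++] = 0;		/* zero-pad to a full block */
+#		return otp + len;
+#	}
+#
+# xor128_decrypt_n_pad is identical except that it keeps the input
+# (ciphertext) rather than the output in the otp buffer.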
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<.Lprologue
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
+ jae .Lcommon_seh_tail
+
+ lea 48(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+ jmp .Lcommon_seh_tail
+.size se_handler,.-se_handler
+
+.type avx_handler,\@abi-omnipotent
+.align 16
+avx_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 208($context),%rax # pull context->R11
+
+ lea 0x50(%rax),%rsi
+ lea 0xf8(%rax),%rax
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+	mov	\$154,%ecx		# sizeof(CONTEXT)/8, in qwords
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ RET
+.size avx_handler,.-avx_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_poly1305_init_x86_64
+ .rva .LSEH_end_poly1305_init_x86_64
+ .rva .LSEH_info_poly1305_init_x86_64
+
+ .rva .LSEH_begin_poly1305_blocks_x86_64
+ .rva .LSEH_end_poly1305_blocks_x86_64
+ .rva .LSEH_info_poly1305_blocks_x86_64
+
+ .rva .LSEH_begin_poly1305_emit_x86_64
+ .rva .LSEH_end_poly1305_emit_x86_64
+ .rva .LSEH_info_poly1305_emit_x86_64
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_poly1305_blocks_avx
+ .rva .Lbase2_64_avx
+ .rva .LSEH_info_poly1305_blocks_avx_1
+
+ .rva .Lbase2_64_avx
+ .rva .Leven_avx
+ .rva .LSEH_info_poly1305_blocks_avx_2
+
+ .rva .Leven_avx
+ .rva .LSEH_end_poly1305_blocks_avx
+ .rva .LSEH_info_poly1305_blocks_avx_3
+
+ .rva .LSEH_begin_poly1305_emit_avx
+ .rva .LSEH_end_poly1305_emit_avx
+ .rva .LSEH_info_poly1305_emit_avx
+___
+$code.=<<___ if ($avx>1);
+ .rva .LSEH_begin_poly1305_blocks_avx2
+ .rva .Lbase2_64_avx2
+ .rva .LSEH_info_poly1305_blocks_avx2_1
+
+ .rva .Lbase2_64_avx2
+ .rva .Leven_avx2
+ .rva .LSEH_info_poly1305_blocks_avx2_2
+
+ .rva .Leven_avx2
+ .rva .LSEH_end_poly1305_blocks_avx2
+ .rva .LSEH_info_poly1305_blocks_avx2_3
+___
+$code.=<<___ if ($avx>2);
+ .rva .LSEH_begin_poly1305_blocks_avx512
+ .rva .LSEH_end_poly1305_blocks_avx512
+ .rva .LSEH_info_poly1305_blocks_avx512
+___
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_poly1305_init_x86_64:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
+
+.LSEH_info_poly1305_blocks_x86_64:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lblocks_body,.Lblocks_epilogue
+
+.LSEH_info_poly1305_emit_x86_64:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
+___
+$code.=<<___ if ($avx);
+.LSEH_info_poly1305_blocks_avx_1:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx_2:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx_3:
+ .byte 9,0,0,0
+ .rva avx_handler
+ .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
+
+.LSEH_info_poly1305_emit_avx:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
+___
+$code.=<<___ if ($avx>1);
+.LSEH_info_poly1305_blocks_avx2_1:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx2_2:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx2_3:
+ .byte 9,0,0,0
+ .rva avx_handler
+ .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
+___
+$code.=<<___ if ($avx>2);
+.LSEH_info_poly1305_blocks_avx512:
+ .byte 9,0,0,0
+ .rva avx_handler
+ .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
+___
+}
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/\/\// and !/^$/);
+ print;
+}
+close SELF;
+
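+# Translate the register-width annotations used above into real register
+# names: "#d" selects the 32-bit form (e.g. "%rax#d" -> "%eax"), and the
+# "%x#", "%y#", "%z#" prefixes select the desired xmm/ymm/zmm form of a
+# register name (e.g. "%x#%ymm2" -> "%xmm2").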
+foreach (split('\n',$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+ s/%r([a-z]+)#d/%e$1/g;
+ s/%r([0-9]+)#d/%r$1d/g;
+ s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
+
+ if ($kernel) {
+ s/(^\.type.*),[0-9]+$/\1/;
+ s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
+ next if /^\.cfi.*/;
+ }
+
+ print $_,"\n";
+}
+close STDOUT;
diff --git a/lib/crypto/x86/poly1305.h b/lib/crypto/x86/poly1305.h
new file mode 100644
index 000000000000..ee92e3740a78
--- /dev/null
+++ b/lib/crypto/x86/poly1305.h
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <asm/cpu_device_id.h>
+#include <asm/fpu/api.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+
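+/*
+ * The accumulator is kept either as five base 2^26 limbs (the SIMD
+ * representation, h[] below) or as three base 2^64 words (the scalar
+ * representation, hs[]); the two overlay each other in the union, and
+ * is_base2_26 records which representation is currently valid.
+ */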
+struct poly1305_arch_internal {
+ union {
+ struct {
+ u32 h[5];
+ u32 is_base2_26;
+ };
+ u64 hs[3];
+ };
+ u64 r[2];
+ u64 pad;
+ struct { u32 r2, r1, r4, r3; } rn[9];
+};
+
+/*
+ * The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
+ * the unfortunate situation of using AVX and then having to go back to scalar
+ * -- because the user is silly and has called the update function from two
+ * separate contexts -- then we need to convert back to the original base before
+ * proceeding. It is possible to reason that the initial reduction below is
+ * sufficient given the implementation invariants. However, for an avoidance of
+ * doubt and because this is not performance critical, we do the full reduction
+ * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py
+ */
+static void convert_to_base2_64(void *ctx)
+{
+ struct poly1305_arch_internal *state = ctx;
+ u32 cy;
+
+ if (!state->is_base2_26)
+ return;
+
+ cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
+ cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
+ cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
+ cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
+ state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
+ state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
+ state->hs[2] = state->h[4] >> 24;
+ /* Unsigned Less Than: branchlessly produces 1 if a < b, else 0. */
+#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
+ cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
+ state->hs[2] &= 3;
+ state->hs[0] += cy;
+ state->hs[1] += (cy = ULT(state->hs[0], cy));
+ state->hs[2] += ULT(state->hs[1], cy);
+#undef ULT
+ state->is_base2_26 = 0;
+}
+
+asmlinkage void poly1305_init_x86_64(struct poly1305_block_state *state,
+ const u8 raw_key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx,
+ const u8 *inp,
+ const size_t len, const u32 padbit);
+asmlinkage void poly1305_emit_x86_64(const struct poly1305_state *ctx,
+ u8 mac[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4]);
+asmlinkage void poly1305_emit_avx(const struct poly1305_state *ctx,
+ u8 mac[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4]);
+asmlinkage void poly1305_blocks_avx(struct poly1305_arch_internal *ctx,
+ const u8 *inp, const size_t len,
+ const u32 padbit);
+asmlinkage void poly1305_blocks_avx2(struct poly1305_arch_internal *ctx,
+ const u8 *inp, const size_t len,
+ const u32 padbit);
+asmlinkage void poly1305_blocks_avx512(struct poly1305_arch_internal *ctx,
+ const u8 *inp,
+ const size_t len, const u32 padbit);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
+
+static void poly1305_block_init(struct poly1305_block_state *state,
+ const u8 raw_key[POLY1305_BLOCK_SIZE])
+{
+ poly1305_init_x86_64(state, raw_key);
+}
+
+static void poly1305_blocks(struct poly1305_block_state *state, const u8 *inp,
+ unsigned int len, u32 padbit)
+{
+ struct poly1305_arch_internal *ctx =
+ container_of(&state->h.h, struct poly1305_arch_internal, h);
+
+ /* SIMD disables preemption, so relax after processing each page. */
+ BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
+ SZ_4K % POLY1305_BLOCK_SIZE);
+
+ /*
+ * The AVX implementations have significant setup overhead (e.g. key
+ * power computation, kernel FPU enabling) which makes them slower for
+ * short messages. Fall back to the scalar implementation for messages
+ * shorter than 288 bytes, unless the AVX-specific key setup has already
+ * been performed (indicated by ctx->is_base2_26).
+ */
+ if (!static_branch_likely(&poly1305_use_avx) ||
+ (len < POLY1305_BLOCK_SIZE * 18 && !ctx->is_base2_26) ||
+ unlikely(!irq_fpu_usable())) {
+ convert_to_base2_64(ctx);
+ poly1305_blocks_x86_64(ctx, inp, len, padbit);
+ return;
+ }
+
+ do {
+ const unsigned int bytes = min(len, SZ_4K);
+
+ kernel_fpu_begin();
+ if (static_branch_likely(&poly1305_use_avx512))
+ poly1305_blocks_avx512(ctx, inp, bytes, padbit);
+ else if (static_branch_likely(&poly1305_use_avx2))
+ poly1305_blocks_avx2(ctx, inp, bytes, padbit);
+ else
+ poly1305_blocks_avx(ctx, inp, bytes, padbit);
+ kernel_fpu_end();
+
+ len -= bytes;
+ inp += bytes;
+ } while (len);
+}
+
+static void poly1305_emit(const struct poly1305_state *ctx,
+ u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4])
+{
+ if (!static_branch_likely(&poly1305_use_avx))
+ poly1305_emit_x86_64(ctx, mac, nonce);
+ else
+ poly1305_emit_avx(ctx, mac, nonce);
+}
+
+#define poly1305_mod_init_arch poly1305_mod_init_arch
+static void poly1305_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_AVX) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+ static_branch_enable(&poly1305_use_avx);
+ if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+ static_branch_enable(&poly1305_use_avx2);
+ if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX512F) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
+ /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
+ boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X)
+ static_branch_enable(&poly1305_use_avx512);
+}
diff --git a/lib/crypto/x86/polyval-pclmul-avx.S b/lib/crypto/x86/polyval-pclmul-avx.S
new file mode 100644
index 000000000000..7f739465ad35
--- /dev/null
+++ b/lib/crypto/x86/polyval-pclmul-avx.S
@@ -0,0 +1,319 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2021 Google LLC
+ */
+/*
+ * This is an efficient implementation of POLYVAL using Intel PCLMULQDQ-NI
+ * instructions. It works on 8 blocks at a time by precomputing the first 8
+ * key powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
+ * allows us to split finite field multiplication into two steps.
+ *
+ * In the first step, we consider h^i, m_i as normal polynomials of degree less
+ * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
+ * is simply polynomial multiplication.
+ *
+ * In the second step, we compute the reduction of p(x) modulo the finite field
+ * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
+ * multiplication is finite field multiplication. The advantage is that the
+ * two-step process only requires 1 finite field reduction for every 8
+ * polynomial multiplications. Further parallelism is gained by interleaving the
+ * multiplications and polynomial reductions.
+ */
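+/*
+ * As a rough model (illustrative pseudo-C, not part of this file), with
+ * clmul() denoting a 128x128 -> 256-bit carry-less multiply and the running
+ * accumulator XORed into m_0 beforehand, one 8-block stride computes:
+ *
+ *	u256 p = 0;
+ *	for (int i = 0; i < 8; i++)
+ *		p ^= clmul(h_pow[8 - i], m[i]);	// h^8*m_0 + ... + h^1*m_7
+ *	acc = reduce(p);			// one reduction per 8 blocks
+ */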
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define STRIDE_BLOCKS 8
+
+#define GSTAR %xmm7
+#define PL %xmm8
+#define PH %xmm9
+#define TMP_XMM %xmm11
+#define LO %xmm12
+#define HI %xmm13
+#define MI %xmm14
+#define SUM %xmm15
+
+#define ACCUMULATOR %rdi
+#define KEY_POWERS %rsi
+#define MSG %rdx
+#define BLOCKS_LEFT %rcx
+#define TMP %rax
+
+.section .rodata.cst16.gstar, "aM", @progbits, 16
+.align 16
+
+.Lgstar:
+ .quad 0xc200000000000000, 0xc200000000000000
+
+.text
+
+/*
+ * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
+ * count pointed to by MSG and KEY_POWERS.
+ */
+.macro schoolbook1 count
+ .set i, 0
+ .rept (\count)
+ schoolbook1_iteration i 0
+	.set i, (i + 1)
+ .endr
+.endm
+
+/*
+ * Computes the product of two 128-bit polynomials at the memory locations
+ * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
+ * the 256-bit product into LO, MI, HI.
+ *
+ * Given:
+ * X = [X_1 : X_0]
+ * Y = [Y_1 : Y_0]
+ *
+ * We compute:
+ * LO += X_0 * Y_0
+ * MI += X_0 * Y_1 + X_1 * Y_0
+ * HI += X_1 * Y_1
+ *
+ * Later, the 256-bit result can be extracted as:
+ * [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
+ * This step is done when computing the polynomial reduction for efficiency
+ * reasons.
+ *
+ * If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an
+ * extra multiplication of SUM and h^8.
+ */
+.macro schoolbook1_iteration i xor_sum
+ movups (16*\i)(MSG), %xmm0
+ .if (\i == 0 && \xor_sum == 1)
+ pxor SUM, %xmm0
+ .endif
+	vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2	# X_1 * Y_0
+	vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1	# X_0 * Y_0
+	vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3	# X_0 * Y_1
+	vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4	# X_1 * Y_1
+ vpxor %xmm2, MI, MI
+ vpxor %xmm1, LO, LO
+ vpxor %xmm4, HI, HI
+ vpxor %xmm3, MI, MI
+.endm
+
+/*
+ * Performs the same computation as schoolbook1_iteration, except we expect the
+ * arguments to already be loaded into xmm0 and xmm1 and we set the result
+ * registers LO, MI, and HI directly rather than XOR'ing into them.
+ */
+.macro schoolbook1_noload
+ vpclmulqdq $0x01, %xmm0, %xmm1, MI
+ vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
+ vpclmulqdq $0x00, %xmm0, %xmm1, LO
+ vpclmulqdq $0x11, %xmm0, %xmm1, HI
+ vpxor %xmm2, MI, MI
+.endm
+
+/*
+ * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
+ * the result in PL, PH.
+ * [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
+ */
+.macro schoolbook2
+ vpslldq $8, MI, PL
+ vpsrldq $8, MI, PH
+ pxor LO, PL
+ pxor HI, PH
+.endm
+
+/*
+ * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
+ *
+ * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
+ * x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
+ * product of two 128-bit polynomials in Montgomery form. We need to reduce it
+ * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
+ * of x^128, this product has two extra factors of x^128. To get it back into
+ * Montgomery form, we need to remove one of these factors by dividing by x^128.
+ *
+ * To accomplish both of these goals, we add multiples of g(x) that cancel out
+ * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
+ * bits are zero, the polynomial division by x^128 can be done by right shifting.
+ *
+ * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
+ * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
+ * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
+ * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
+ * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
+ * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
+ *
+ * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
+ * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
+ * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
+ * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
+ * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
+ *
+ * So our final computation is:
+ * T = T_1 : T_0 = g*(x) * P_0
+ * V = V_1 : V_0 = g*(x) * (P_1 + T_0)
+ * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
+ *
+ * The implementation below saves an XOR instruction by computing P_1 + T_0
+ * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
+ * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
+ */
+.macro montgomery_reduction dest
+ vpclmulqdq $0x00, PL, GSTAR, TMP_XMM # TMP_XMM = T_1 : T_0 = P_0 * g*(x)
+ pshufd $0b01001110, TMP_XMM, TMP_XMM # TMP_XMM = T_0 : T_1
+ pxor PL, TMP_XMM # TMP_XMM = P_1 + T_0 : P_0 + T_1
+ pxor TMP_XMM, PH # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
+ pclmulqdq $0x11, GSTAR, TMP_XMM # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
+ vpxor TMP_XMM, PH, \dest
+.endm
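+/*
+ * Equivalent data flow in pseudo-C (illustrative only; clmul64() stands
+ * for a 64x64 -> 128-bit carry-less multiply, and + is XOR in GF(2)):
+ *
+ *	T = clmul64(g*, P_0);		// T_1 : T_0
+ *	V = clmul64(g*, P_1 + T_0);	// V_1 : V_0
+ *	dest = (P_3 + P_1 + T_0 + V_1) : (P_2 + P_0 + T_1 + V_0)
+ */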
+
+/*
+ * Compute schoolbook multiplication for 8 blocks
+ * m_0h^8 + ... + m_7h^1
+ *
+ * If reduce is set, also computes the montgomery reduction of the
+ * previous full_stride call and XORs with the first message block.
+ * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
+ * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
+ */
+.macro full_stride reduce
+ pxor LO, LO
+ pxor HI, HI
+ pxor MI, MI
+
+ schoolbook1_iteration 7 0
+ .if \reduce
+ vpclmulqdq $0x00, PL, GSTAR, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 6 0
+ .if \reduce
+ pshufd $0b01001110, TMP_XMM, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 5 0
+ .if \reduce
+ pxor PL, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 4 0
+ .if \reduce
+ pxor TMP_XMM, PH
+ .endif
+
+ schoolbook1_iteration 3 0
+ .if \reduce
+ pclmulqdq $0x11, GSTAR, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 2 0
+ .if \reduce
+ vpxor TMP_XMM, PH, SUM
+ .endif
+
+ schoolbook1_iteration 1 0
+
+ schoolbook1_iteration 0 1
+
+ addq $(8*16), MSG
+ schoolbook2
+.endm
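+/*
+ * Note that when \reduce is set, the six instructions of
+ * montgomery_reduction are spread one at a time across the schoolbook
+ * iterations above, hiding the reduction latency behind the multiplies.
+ */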
+
+/*
+ * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
+ */
+.macro partial_stride
+	mov BLOCKS_LEFT, TMP
+	shlq $4, TMP				# TMP = 16*BLOCKS_LEFT
+	addq $(16*STRIDE_BLOCKS), KEY_POWERS
+	subq TMP, KEY_POWERS			# point at h^BLOCKS_LEFT
+
+ movups (MSG), %xmm0
+ pxor SUM, %xmm0
+ movups (KEY_POWERS), %xmm1
+ schoolbook1_noload
+ dec BLOCKS_LEFT
+ addq $16, MSG
+ addq $16, KEY_POWERS
+
+ test $4, BLOCKS_LEFT
+ jz .Lpartial4BlocksDone
+ schoolbook1 4
+ addq $(4*16), MSG
+ addq $(4*16), KEY_POWERS
+.Lpartial4BlocksDone:
+ test $2, BLOCKS_LEFT
+ jz .Lpartial2BlocksDone
+ schoolbook1 2
+ addq $(2*16), MSG
+ addq $(2*16), KEY_POWERS
+.Lpartial2BlocksDone:
+ test $1, BLOCKS_LEFT
+ jz .LpartialDone
+ schoolbook1 1
+.LpartialDone:
+ schoolbook2
+ montgomery_reduction SUM
+.endm
+
+/*
+ * Computes a = a * b * x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * void polyval_mul_pclmul_avx(struct polyval_elem *a,
+ * const struct polyval_elem *b);
+ */
+SYM_FUNC_START(polyval_mul_pclmul_avx)
+ FRAME_BEGIN
+ vmovdqa .Lgstar(%rip), GSTAR
+ movups (%rdi), %xmm0
+ movups (%rsi), %xmm1
+ schoolbook1_noload
+ schoolbook2
+ montgomery_reduction SUM
+ movups SUM, (%rdi)
+ FRAME_END
+ RET
+SYM_FUNC_END(polyval_mul_pclmul_avx)
+
+/*
+ * Perform polynomial evaluation as specified by POLYVAL. This computes:
+ * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
+ * where n=nblocks, h is the hash key, and m_i are the message blocks.
+ *
+ * rdi - pointer to the accumulator
+ * rsi - pointer to precomputed key powers h^8 ... h^1
+ * rdx - pointer to message blocks
+ * rcx - number of blocks to hash
+ *
+ * void polyval_blocks_pclmul_avx(struct polyval_elem *acc,
+ * const struct polyval_key *key,
+ * const u8 *data, size_t nblocks);
+ */
+SYM_FUNC_START(polyval_blocks_pclmul_avx)
+ FRAME_BEGIN
+ vmovdqa .Lgstar(%rip), GSTAR
+ movups (ACCUMULATOR), SUM
+ subq $STRIDE_BLOCKS, BLOCKS_LEFT
+ js .LstrideLoopExit
+ full_stride 0
+ subq $STRIDE_BLOCKS, BLOCKS_LEFT
+ js .LstrideLoopExitReduce
+.LstrideLoop:
+ full_stride 1
+ subq $STRIDE_BLOCKS, BLOCKS_LEFT
+ jns .LstrideLoop
+.LstrideLoopExitReduce:
+ montgomery_reduction SUM
+.LstrideLoopExit:
+ add $STRIDE_BLOCKS, BLOCKS_LEFT
+ jz .LskipPartial
+ partial_stride
+.LskipPartial:
+ movups SUM, (ACCUMULATOR)
+ FRAME_END
+ RET
+SYM_FUNC_END(polyval_blocks_pclmul_avx)
diff --git a/lib/crypto/x86/polyval.h b/lib/crypto/x86/polyval.h
new file mode 100644
index 000000000000..ef8797521420
--- /dev/null
+++ b/lib/crypto/x86/polyval.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * POLYVAL library functions, x86_64 optimized
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <linux/cpufeature.h>
+
+#define NUM_H_POWERS 8
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmul_avx);
+
+asmlinkage void polyval_mul_pclmul_avx(struct polyval_elem *a,
+ const struct polyval_elem *b);
+asmlinkage void polyval_blocks_pclmul_avx(struct polyval_elem *acc,
+ const struct polyval_key *key,
+ const u8 *data, size_t nblocks);
+
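+/*
+ * key->h_powers[i] holds h^(NUM_H_POWERS - i): h_powers[0] = h^8 down to
+ * h_powers[NUM_H_POWERS - 1] = h^1, matching the order in which the
+ * assembly walks the key schedule.
+ */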
+static void polyval_preparekey_arch(struct polyval_key *key,
+ const u8 raw_key[POLYVAL_BLOCK_SIZE])
+{
+ static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS);
+ memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE);
+ if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+ kernel_fpu_begin();
+ for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+ key->h_powers[i] = key->h_powers[i + 1];
+ polyval_mul_pclmul_avx(
+ &key->h_powers[i],
+ &key->h_powers[NUM_H_POWERS - 1]);
+ }
+ kernel_fpu_end();
+ } else {
+ for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+ key->h_powers[i] = key->h_powers[i + 1];
+ polyval_mul_generic(&key->h_powers[i],
+ &key->h_powers[NUM_H_POWERS - 1]);
+ }
+ }
+}
+
+static void polyval_mul_arch(struct polyval_elem *acc,
+ const struct polyval_key *key)
+{
+ if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+ kernel_fpu_begin();
+ polyval_mul_pclmul_avx(acc, &key->h_powers[NUM_H_POWERS - 1]);
+ kernel_fpu_end();
+ } else {
+ polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]);
+ }
+}
+
+static void polyval_blocks_arch(struct polyval_elem *acc,
+ const struct polyval_key *key,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+ do {
+ /* Allow rescheduling every 4 KiB. */
+ size_t n = min_t(size_t, nblocks,
+ 4096 / POLYVAL_BLOCK_SIZE);
+
+ kernel_fpu_begin();
+ polyval_blocks_pclmul_avx(acc, key, data, n);
+ kernel_fpu_end();
+ data += n * POLYVAL_BLOCK_SIZE;
+ nblocks -= n;
+ } while (nblocks);
+ } else {
+ polyval_blocks_generic(acc, &key->h_powers[NUM_H_POWERS - 1],
+ data, nblocks);
+ }
+}
+
+#define polyval_mod_init_arch polyval_mod_init_arch
+static void polyval_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ) &&
+ boot_cpu_has(X86_FEATURE_AVX))
+ static_branch_enable(&have_pclmul_avx);
+}
diff --git a/lib/crypto/x86/sha1-avx2-asm.S b/lib/crypto/x86/sha1-avx2-asm.S
new file mode 100644
index 000000000000..91b2dc6db179
--- /dev/null
+++ b/lib/crypto/x86/sha1-avx2-asm.S
@@ -0,0 +1,697 @@
+/*
+ * Implement fast SHA-1 with AVX2 instructions. (x86_64)
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Ilya Albrekht <ilya.albrekht@intel.com>
+ * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
+ * Ronen Zohar <ronen.zohar@intel.com>
+ * Chandramouli Narayanan <mouli@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
+ *
+ * This implementation is based on the previous SSSE3 release:
+ * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
+ *
+ * void sha1_transform_avx2(struct sha1_block_state *state,
+ * const u8 *data, size_t nblocks);
+ */
+
+#include <linux/linkage.h>
+
+#define CTX %rdi /* arg1 */
+#define BUF %rsi /* arg2 */
+#define CNT %rdx /* arg3 */
+
+#define REG_A %ecx
+#define REG_B %esi
+#define REG_C %edi
+#define REG_D %eax
+#define REG_E %edx
+#define REG_TB %ebx
+#define REG_TA %r12d
+#define REG_RA %rcx
+#define REG_RB %rsi
+#define REG_RC %rdi
+#define REG_RD %rax
+#define REG_RE %rdx
+#define REG_RTA %r12
+#define REG_RTB %rbx
+#define REG_T1 %r11d
+#define xmm_mov vmovups
+#define avx2_zeroupper vzeroupper
+#define RND_F1 1
+#define RND_F2 2
+#define RND_F3 3
+
+.macro REGALLOC
+ .set A, REG_A
+ .set B, REG_B
+ .set C, REG_C
+ .set D, REG_D
+ .set E, REG_E
+ .set TB, REG_TB
+ .set TA, REG_TA
+
+ .set RA, REG_RA
+ .set RB, REG_RB
+ .set RC, REG_RC
+ .set RD, REG_RD
+ .set RE, REG_RE
+
+ .set RTA, REG_RTA
+ .set RTB, REG_RTB
+
+ .set T1, REG_T1
+.endm
+
+#define HASH_PTR %r9
+#define BLOCKS_CTR %r8
+#define BUFFER_PTR %r10
+#define BUFFER_PTR2 %r13
+
+#define PRECALC_BUF %r14
+#define WK_BUF %r15
+
+#define W_TMP %xmm0
+#define WY_TMP %ymm0
+#define WY_TMP2 %ymm9
+
+# AVX2 variables
+#define WY0 %ymm3
+#define WY4 %ymm5
+#define WY08 %ymm7
+#define WY12 %ymm8
+#define WY16 %ymm12
+#define WY20 %ymm13
+#define WY24 %ymm14
+#define WY28 %ymm15
+
+#define YMM_SHUFB_BSWAP %ymm10
+
+/*
+ * Keep 2 iterations precalculated at a time:
+ * - 80 DWORDs per iteration * 2
+ */
+#define W_SIZE (80*2*2 +16)
+
+#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
+#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF)
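+
+/*
+ * WK_BUF interleaves the precomputed w[i]+K values of two blocks: each group
+ * of four rounds occupies 32 bytes, with the first block's four dwords at
+ * offset 0 and the second block's at offset 16 within the group.
+ */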
+
+
+.macro UPDATE_HASH hash, val
+ add \hash, \val
+ mov \val, \hash
+.endm
+
+.macro PRECALC_RESET_WY
+ .set WY_00, WY0
+ .set WY_04, WY4
+ .set WY_08, WY08
+ .set WY_12, WY12
+ .set WY_16, WY16
+ .set WY_20, WY20
+ .set WY_24, WY24
+ .set WY_28, WY28
+ .set WY_32, WY_00
+.endm
+
+.macro PRECALC_ROTATE_WY
+ /* Rotate macros */
+ .set WY_32, WY_28
+ .set WY_28, WY_24
+ .set WY_24, WY_20
+ .set WY_20, WY_16
+ .set WY_16, WY_12
+ .set WY_12, WY_08
+ .set WY_08, WY_04
+ .set WY_04, WY_00
+ .set WY_00, WY_32
+
+ /* Define register aliases */
+ .set WY, WY_00
+ .set WY_minus_04, WY_04
+ .set WY_minus_08, WY_08
+ .set WY_minus_12, WY_12
+ .set WY_minus_16, WY_16
+ .set WY_minus_20, WY_20
+ .set WY_minus_24, WY_24
+ .set WY_minus_28, WY_28
+ .set WY_minus_32, WY
+.endm
+
+.macro PRECALC_00_15
+ .if (i == 0) # Initialize and rotate registers
+ PRECALC_RESET_WY
+ PRECALC_ROTATE_WY
+ .endif
+
+ /* message scheduling pre-compute for rounds 0-15 */
+ .if ((i & 7) == 0)
+ /*
+ * blended AVX2 and ALU instruction scheduling
+ * 1 vector iteration per 8 rounds
+ */
+ vmovdqu (i * 2)(BUFFER_PTR), W_TMP
+ .elseif ((i & 7) == 1)
+ vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
+ WY_TMP, WY_TMP
+ .elseif ((i & 7) == 2)
+ vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
+ .elseif ((i & 7) == 4)
+ vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
+ .elseif ((i & 7) == 7)
+ vmovdqu WY_TMP, PRECALC_WK(i&~7)
+
+ PRECALC_ROTATE_WY
+ .endif
+.endm
+
+.macro PRECALC_16_31
+ /*
+ * message scheduling pre-compute for rounds 16-31
+ * calculating last 32 w[i] values in 8 XMM registers
+ * pre-calculate K+w[i] values and store to mem
+ * for later load by ALU add instruction
+ *
+ * "brute force" vectorization for rounds 16-31 only
+ * due to w[i]->w[i-3] dependency
+ */
+ .if ((i & 7) == 0)
+ /*
+ * blended AVX2 and ALU instruction scheduling
+ * 1 vector iteration per 8 rounds
+ */
+ /* w[i-14] */
+ vpalignr $8, WY_minus_16, WY_minus_12, WY
+ vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */
+ .elseif ((i & 7) == 1)
+ vpxor WY_minus_08, WY, WY
+ vpxor WY_minus_16, WY_TMP, WY_TMP
+ .elseif ((i & 7) == 2)
+ vpxor WY_TMP, WY, WY
+ vpslldq $12, WY, WY_TMP2
+ .elseif ((i & 7) == 3)
+ vpslld $1, WY, WY_TMP
+ vpsrld $31, WY, WY
+ .elseif ((i & 7) == 4)
+ vpor WY, WY_TMP, WY_TMP
+ vpslld $2, WY_TMP2, WY
+ .elseif ((i & 7) == 5)
+ vpsrld $30, WY_TMP2, WY_TMP2
+ vpxor WY, WY_TMP, WY_TMP
+ .elseif ((i & 7) == 7)
+ vpxor WY_TMP2, WY_TMP, WY
+ vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
+ vmovdqu WY_TMP, PRECALC_WK(i&~7)
+
+ PRECALC_ROTATE_WY
+ .endif
+.endm
+
+.macro PRECALC_32_79
+ /*
+ * in SHA-1 specification:
+ * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
+	 * we instead compute the equivalent:
+	 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+	 * which allows more efficient vectorization,
+	 * since the w[i] => w[i-3] dependency is broken
+ */
+
+ .if ((i & 7) == 0)
+ /*
+ * blended AVX2 and ALU instruction scheduling
+ * 1 vector iteration per 8 rounds
+ */
+ vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
+ .elseif ((i & 7) == 1)
+ /* W is W_minus_32 before xor */
+ vpxor WY_minus_28, WY, WY
+ .elseif ((i & 7) == 2)
+ vpxor WY_minus_16, WY_TMP, WY_TMP
+ .elseif ((i & 7) == 3)
+ vpxor WY_TMP, WY, WY
+ .elseif ((i & 7) == 4)
+ vpslld $2, WY, WY_TMP
+ .elseif ((i & 7) == 5)
+ vpsrld $30, WY, WY
+ vpor WY, WY_TMP, WY
+ .elseif ((i & 7) == 7)
+ vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
+ vmovdqu WY_TMP, PRECALC_WK(i&~7)
+
+ PRECALC_ROTATE_WY
+ .endif
+.endm
+
+.macro PRECALC r, s
+ .set i, \r
+
+ .if (i < 40)
+ .set K_XMM, 32*0
+ .elseif (i < 80)
+ .set K_XMM, 32*1
+ .elseif (i < 120)
+ .set K_XMM, 32*2
+ .else
+ .set K_XMM, 32*3
+ .endif
+
+ .if (i<32)
+ PRECALC_00_15 \s
+ .elseif (i<64)
+ PRECALC_16_31 \s
+ .elseif (i < 160)
+ PRECALC_32_79 \s
+ .endif
+.endm
+
+.macro ROTATE_STATE
+ .set T_REG, E
+ .set E, D
+ .set D, C
+ .set C, B
+ .set B, TB
+ .set TB, A
+ .set A, T_REG
+
+ .set T_REG, RE
+ .set RE, RD
+ .set RD, RC
+ .set RC, RB
+ .set RB, RTB
+ .set RTB, RA
+ .set RA, T_REG
+.endm
+
+/* Macro relies on saved ROUND_Fx */
+
+.macro RND_FUN f, r
+ .if (\f == RND_F1)
+ ROUND_F1 \r
+ .elseif (\f == RND_F2)
+ ROUND_F2 \r
+ .elseif (\f == RND_F3)
+ ROUND_F3 \r
+ .endif
+.endm
+
+.macro RR r
+ .set round_id, (\r % 80)
+
+ .if (round_id == 0) /* Precalculate F for first round */
+ .set ROUND_FUNC, RND_F1
+ mov B, TB
+
+ rorx $(32-30), B, B /* b>>>2 */
+ andn D, TB, T1
+ and C, TB
+ xor T1, TB
+ .endif
+
+ RND_FUN ROUND_FUNC, \r
+ ROTATE_STATE
+
+ .if (round_id == 18)
+ .set ROUND_FUNC, RND_F2
+ .elseif (round_id == 38)
+ .set ROUND_FUNC, RND_F3
+ .elseif (round_id == 58)
+ .set ROUND_FUNC, RND_F2
+ .endif
+
+ .set round_id, ( (\r+1) % 80)
+
+ RND_FUN ROUND_FUNC, (\r+1)
+ ROTATE_STATE
+.endm
+
+.macro ROUND_F1 r
+ add WK(\r), E
+
+ andn C, A, T1 /* ~b&d */
+ lea (RE,RTB), E /* Add F from the previous round */
+
+ rorx $(32-5), A, TA /* T2 = A >>> 5 */
+ rorx $(32-30),A, TB /* b>>>2 for next round */
+
+ PRECALC (\r) /* msg scheduling for next 2 blocks */
+
+ /*
+ * Calculate F for the next round
+ * (b & c) ^ andn[b, d]
+ */
+ and B, A /* b&c */
+ xor T1, A /* F1 = (b&c) ^ (~b&d) */
+
+ lea (RE,RTA), E /* E += A >>> 5 */
+.endm
+
+.macro ROUND_F2 r
+ add WK(\r), E
+ lea (RE,RTB), E /* Add F from the previous round */
+
+ /* Calculate F for the next round */
+ rorx $(32-5), A, TA /* T2 = A >>> 5 */
+ .if ((round_id) < 79)
+ rorx $(32-30), A, TB /* b>>>2 for next round */
+ .endif
+ PRECALC (\r) /* msg scheduling for next 2 blocks */
+
+ .if ((round_id) < 79)
+ xor B, A
+ .endif
+
+ add TA, E /* E += A >>> 5 */
+
+ .if ((round_id) < 79)
+ xor C, A
+ .endif
+.endm
+
+.macro ROUND_F3 r
+ add WK(\r), E
+ PRECALC (\r) /* msg scheduling for next 2 blocks */
+
+ lea (RE,RTB), E /* Add F from the previous round */
+
+ mov B, T1
+ or A, T1
+
+ rorx $(32-5), A, TA /* T2 = A >>> 5 */
+ rorx $(32-30), A, TB /* b>>>2 for next round */
+
+ /* Calculate F for the next round
+ * (b and c) or (d and (b or c))
+ */
+ and C, T1
+ and B, A
+ or T1, A
+
+ add TA, E /* E += A >>> 5 */
+
+.endm
+
+/*
+ * Add the constant \d to \a only if \b >= \c (uses RTA as a temporary):
+ * \a = (\b >= \c) ? \a + \d : \a
+ */
+.macro ADD_IF_GE a, b, c, d
+ mov \a, RTA
+ add $\d, RTA
+ cmp $\c, \b
+ cmovge RTA, \a
+.endm
+
+/*
+ * This macro implements 80 rounds of SHA-1 for multiple blocks with software
+ * pipelining: two blocks are processed per iteration, and the message
+ * schedule for the next two blocks is precomputed during the current rounds.
+ */
+.macro SHA1_PIPELINED_MAIN_BODY
+
+ REGALLOC
+
+ mov (HASH_PTR), A
+ mov 4(HASH_PTR), B
+ mov 8(HASH_PTR), C
+ mov 12(HASH_PTR), D
+ mov 16(HASH_PTR), E
+
+ mov %rsp, PRECALC_BUF
+ lea (2*4*80+32)(%rsp), WK_BUF
+
+ # Precalc WK for first 2 blocks
+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
+ .set i, 0
+ .rept 160
+ PRECALC i
+ .set i, i + 1
+ .endr
+
+ /* Go to next block if needed */
+ ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
+ xchg WK_BUF, PRECALC_BUF
+
+ .align 32
+.L_loop:
+	/*
+	 * The code loops through more than one block; BLOCKS_CTR reaching
+	 * zero signals that the last block has been processed.
+	 */
+ test BLOCKS_CTR, BLOCKS_CTR
+ jnz .L_begin
+ .align 32
+ jmp .L_end
+ .align 32
+.L_begin:
+
+ /*
+ * Do first block
+ * rounds: 0,2,4,6,8
+ */
+ .set j, 0
+ .rept 5
+ RR j
+ .set j, j+2
+ .endr
+
+ /*
+ * rounds:
+ * 10,12,14,16,18
+ * 20,22,24,26,28
+ * 30,32,34,36,38
+ * 40,42,44,46,48
+ * 50,52,54,56,58
+ */
+ .rept 25
+ RR j
+ .set j, j+2
+ .endr
+
+ /* Update Counter */
+ sub $1, BLOCKS_CTR
+	/* Move to the next block only if needed */
+ ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
+ /*
+ * rounds
+ * 60,62,64,66,68
+ * 70,72,74,76,78
+ */
+ .rept 10
+ RR j
+ .set j, j+2
+ .endr
+
+ UPDATE_HASH (HASH_PTR), A
+ UPDATE_HASH 4(HASH_PTR), TB
+ UPDATE_HASH 8(HASH_PTR), C
+ UPDATE_HASH 12(HASH_PTR), D
+ UPDATE_HASH 16(HASH_PTR), E
+
+ test BLOCKS_CTR, BLOCKS_CTR
+ jz .L_loop
+
+ mov TB, B
+
+ /* Process second block */
+ /*
+ * rounds
+ * 0+80, 2+80, 4+80, 6+80, 8+80
+ * 10+80,12+80,14+80,16+80,18+80
+ */
+
+ .set j, 0
+ .rept 10
+ RR j+80
+ .set j, j+2
+ .endr
+
+ /*
+ * rounds
+ * 20+80,22+80,24+80,26+80,28+80
+ * 30+80,32+80,34+80,36+80,38+80
+ */
+ .rept 10
+ RR j+80
+ .set j, j+2
+ .endr
+
+ /*
+ * rounds
+ * 40+80,42+80,44+80,46+80,48+80
+ * 50+80,52+80,54+80,56+80,58+80
+ */
+ .rept 10
+ RR j+80
+ .set j, j+2
+ .endr
+
+ /* update counter */
+ sub $1, BLOCKS_CTR
+	/* Move to the next block only if needed */
+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
+
+ /*
+ * rounds
+ * 60+80,62+80,64+80,66+80,68+80
+ * 70+80,72+80,74+80,76+80,78+80
+ */
+ .rept 10
+ RR j+80
+ .set j, j+2
+ .endr
+
+ UPDATE_HASH (HASH_PTR), A
+ UPDATE_HASH 4(HASH_PTR), TB
+ UPDATE_HASH 8(HASH_PTR), C
+ UPDATE_HASH 12(HASH_PTR), D
+ UPDATE_HASH 16(HASH_PTR), E
+
+ /* Reset state for AVX2 reg permutation */
+ mov A, TA
+ mov TB, A
+ mov C, TB
+ mov E, C
+ mov D, B
+ mov TA, D
+
+ REGALLOC
+
+ xchg WK_BUF, PRECALC_BUF
+
+ jmp .L_loop
+
+ .align 32
+.L_end:
+
+.endm
+/*
+ * This macro implements the SHA-1 function's body for several 64-byte blocks.
+ * param: function's name
+ */
+.macro SHA1_VECTOR_ASM name
+ SYM_FUNC_START(\name)
+
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ RESERVE_STACK = (W_SIZE*4 + 8+24)
+
+ /* Align stack */
+ push %rbp
+ mov %rsp, %rbp
+ and $~(0x20-1), %rsp
+ sub $RESERVE_STACK, %rsp
+
+ avx2_zeroupper
+
+ /* Setup initial values */
+ mov CTX, HASH_PTR
+ mov BUF, BUFFER_PTR
+
+ mov BUF, BUFFER_PTR2
+ mov CNT, BLOCKS_CTR
+
+ xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
+
+ SHA1_PIPELINED_MAIN_BODY
+
+ avx2_zeroupper
+
+ mov %rbp, %rsp
+ pop %rbp
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+
+ RET
+
+ SYM_FUNC_END(\name)
+.endm
+
+.section .rodata
+
+#define K1 0x5a827999
+#define K2 0x6ed9eba1
+#define K3 0x8f1bbcdc
+#define K4 0xca62c1d6
+
+.align 128
+K_XMM_AR:
+ .long K1, K1, K1, K1
+ .long K1, K1, K1, K1
+ .long K2, K2, K2, K2
+ .long K2, K2, K2, K2
+ .long K3, K3, K3, K3
+ .long K3, K3, K3, K3
+ .long K4, K4, K4, K4
+ .long K4, K4, K4, K4
+
+BSWAP_SHUFB_CTL:
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x08090a0b
+ .long 0x0c0d0e0f
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x08090a0b
+ .long 0x0c0d0e0f
+.text
+
+SHA1_VECTOR_ASM sha1_transform_avx2
diff --git a/lib/crypto/x86/sha1-ni-asm.S b/lib/crypto/x86/sha1-ni-asm.S
new file mode 100644
index 000000000000..428f9b960594
--- /dev/null
+++ b/lib/crypto/x86/sha1-ni-asm.S
@@ -0,0 +1,152 @@
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-1 update function
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define STATE_PTR %rdi /* 1st arg */
+#define DATA_PTR %rsi /* 2nd arg */
+#define NUM_BLKS %rdx /* 3rd arg */
+
+#define ABCD %xmm0
+#define E0 %xmm1 /* Need two E's b/c they ping pong */
+#define E1 %xmm2
+#define MSG0 %xmm3
+#define MSG1 %xmm4
+#define MSG2 %xmm5
+#define MSG3 %xmm6
+#define SHUF_MASK %xmm7
+#define ABCD_SAVED %xmm8
+#define E0_SAVED %xmm9
+
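+/*
+ * do_4rounds performs four SHA-1 rounds. The two E registers ping-pong
+ * between consecutive groups of four rounds (note the alternating \e0/\e1
+ * arguments in the .irp block below), while sha1msg1/pxor/sha1msg2
+ * incrementally build the message schedule for a later group as the
+ * current rounds execute.
+ */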
+.macro do_4rounds i, m0, m1, m2, m3, e0, e1
+.if \i < 16
+ movdqu \i*4(DATA_PTR), \m0
+ pshufb SHUF_MASK, \m0
+.endif
+.if \i == 0
+ paddd \m0, \e0
+.else
+ sha1nexte \m0, \e0
+.endif
+ movdqa ABCD, \e1
+.if \i >= 12 && \i < 76
+ sha1msg2 \m0, \m1
+.endif
+ sha1rnds4 $\i / 20, \e0, ABCD
+.if \i >= 4 && \i < 68
+ sha1msg1 \m0, \m3
+.endif
+.if \i >= 8 && \i < 72
+ pxor \m0, \m2
+.endif
+.endm
+
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-1 block function
+ *
+ * This function takes a pointer to the current SHA-1 state, a pointer to the
+ * input data, and the number of 64-byte blocks to process. The number of
+ * blocks to process is assumed to be nonzero. Once all blocks have been
+ * processed, the state is updated with the new state. This function only
+ * processes complete blocks. State initialization, buffering of partial
+ * blocks, and digest finalization are expected to be handled elsewhere.
+ *
+ * void sha1_ni_transform(struct sha1_block_state *state,
+ * const u8 *data, size_t nblocks)
+ */
+.text
+SYM_FUNC_START(sha1_ni_transform)
+
+ /* Load the initial state from STATE_PTR. */
+ pxor E0, E0
+ pinsrd $3, 16(STATE_PTR), E0
+ movdqu (STATE_PTR), ABCD
+ pshufd $0x1B, ABCD, ABCD
+
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+
+.Lnext_block:
+ /* Save the state for addition after the rounds. */
+ movdqa E0, E0_SAVED
+ movdqa ABCD, ABCD_SAVED
+
+.irp i, 0, 16, 32, 48, 64
+ do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3, E0, E1
+ do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0, E1, E0
+ do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1, E0, E1
+ do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2, E1, E0
+.endr
+
+ /* Add the previous state (before the rounds) to the current state. */
+ sha1nexte E0_SAVED, E0
+ paddd ABCD_SAVED, ABCD
+
+ /* Advance to the next block, or break if there are no more blocks. */
+ add $64, DATA_PTR
+ dec NUM_BLKS
+ jnz .Lnext_block
+
+ /* Store the new state to STATE_PTR. */
+ pextrd $3, E0, 16(STATE_PTR)
+ pshufd $0x1B, ABCD, ABCD
+ movdqu ABCD, (STATE_PTR)
+
+ RET
+SYM_FUNC_END(sha1_ni_transform)
+
+.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x000102030405060708090a0b0c0d0e0f
diff --git a/lib/crypto/x86/sha1-ssse3-and-avx.S b/lib/crypto/x86/sha1-ssse3-and-avx.S
new file mode 100644
index 000000000000..78b48cb09c5b
--- /dev/null
+++ b/lib/crypto/x86/sha1-ssse3-and-avx.S
@@ -0,0 +1,551 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
+ * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
+ * processors. CPUs supporting Intel(R) AVX extensions will get an additional
+ * boost.
+ *
+ * This work was inspired by the vectorized implementation of Dean Gaudet.
+ * Additional information on it can be found at:
+ * http://www.arctic.org/~dean/crypto/sha1.html
+ *
+ * It was improved upon with more efficient vectorization of the message
+ * scheduling. This implementation has also been optimized for all current and
+ * several future generations of Intel CPUs.
+ *
+ * See this article for more information about the implementation details:
+ * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
+ *
+ * Copyright (C) 2010, Intel Corp.
+ * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
+ * Ronen Zohar <ronen.zohar@intel.com>
+ *
+ * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
+ * Author: Mathias Krause <minipli@googlemail.com>
+ */
+
+#include <linux/linkage.h>
+
+#define CTX %rdi // arg1
+#define BUF %rsi // arg2
+#define CNT %rdx // arg3
+
+#define REG_A %ecx
+#define REG_B %esi
+#define REG_C %edi
+#define REG_D %r12d
+#define REG_E %edx
+
+#define REG_T1 %eax
+#define REG_T2 %ebx
+
+#define K_BASE %r8
+#define HASH_PTR %r9
+#define BUFFER_PTR %r10
+#define BUFFER_END %r11
+
+#define W_TMP1 %xmm0
+#define W_TMP2 %xmm9
+
+#define W0 %xmm1
+#define W4 %xmm2
+#define W8 %xmm3
+#define W12 %xmm4
+#define W16 %xmm5
+#define W20 %xmm6
+#define W24 %xmm7
+#define W28 %xmm8
+
+#define XMM_SHUFB_BSWAP %xmm10
+
+/* we keep a window of 16 pre-calculated w[i]+K values in a 64-byte circular buffer */
+#define WK(t) (((t) & 15) * 4)(%rsp)
+#define W_PRECALC_AHEAD 16
+
+/*
+ * This macro implements the SHA-1 function's body for single 64-byte block
+ * param: function's name
+ */
+.macro SHA1_VECTOR_ASM name
+ SYM_FUNC_START(\name)
+
+ push %rbx
+ push %r12
+ push %rbp
+ mov %rsp, %rbp
+
+ sub $64, %rsp # allocate workspace
+ and $~15, %rsp # align stack
+
+ mov CTX, HASH_PTR
+ mov BUF, BUFFER_PTR
+
+ shl $6, CNT # multiply by 64
+ add BUF, CNT
+ mov CNT, BUFFER_END
+
+ lea K_XMM_AR(%rip), K_BASE
+ xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP
+
+ SHA1_PIPELINED_MAIN_BODY
+
+ # cleanup workspace
+ mov $8, %ecx
+ mov %rsp, %rdi
+ xor %eax, %eax
+ rep stosq
+
+ mov %rbp, %rsp # deallocate workspace
+ pop %rbp
+ pop %r12
+ pop %rbx
+ RET
+
+ SYM_FUNC_END(\name)
+.endm
+
+/*
+ * This macro implements 80 rounds of SHA-1 for one 64-byte block
+ */
+.macro SHA1_PIPELINED_MAIN_BODY
+ INIT_REGALLOC
+
+ mov (HASH_PTR), A
+ mov 4(HASH_PTR), B
+ mov 8(HASH_PTR), C
+ mov 12(HASH_PTR), D
+ mov 16(HASH_PTR), E
+
+ .set i, 0
+ .rept W_PRECALC_AHEAD
+ W_PRECALC i
+ .set i, (i+1)
+ .endr
+
+.align 4
+1:
+ RR F1,A,B,C,D,E,0
+ RR F1,D,E,A,B,C,2
+ RR F1,B,C,D,E,A,4
+ RR F1,E,A,B,C,D,6
+ RR F1,C,D,E,A,B,8
+
+ RR F1,A,B,C,D,E,10
+ RR F1,D,E,A,B,C,12
+ RR F1,B,C,D,E,A,14
+ RR F1,E,A,B,C,D,16
+ RR F1,C,D,E,A,B,18
+
+ RR F2,A,B,C,D,E,20
+ RR F2,D,E,A,B,C,22
+ RR F2,B,C,D,E,A,24
+ RR F2,E,A,B,C,D,26
+ RR F2,C,D,E,A,B,28
+
+ RR F2,A,B,C,D,E,30
+ RR F2,D,E,A,B,C,32
+ RR F2,B,C,D,E,A,34
+ RR F2,E,A,B,C,D,36
+ RR F2,C,D,E,A,B,38
+
+ RR F3,A,B,C,D,E,40
+ RR F3,D,E,A,B,C,42
+ RR F3,B,C,D,E,A,44
+ RR F3,E,A,B,C,D,46
+ RR F3,C,D,E,A,B,48
+
+ RR F3,A,B,C,D,E,50
+ RR F3,D,E,A,B,C,52
+ RR F3,B,C,D,E,A,54
+ RR F3,E,A,B,C,D,56
+ RR F3,C,D,E,A,B,58
+
+ add $64, BUFFER_PTR # move to the next 64-byte block
+ cmp BUFFER_END, BUFFER_PTR # if the current is the last one use
+ cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun
+
+ RR F4,A,B,C,D,E,60
+ RR F4,D,E,A,B,C,62
+ RR F4,B,C,D,E,A,64
+ RR F4,E,A,B,C,D,66
+ RR F4,C,D,E,A,B,68
+
+ RR F4,A,B,C,D,E,70
+ RR F4,D,E,A,B,C,72
+ RR F4,B,C,D,E,A,74
+ RR F4,E,A,B,C,D,76
+ RR F4,C,D,E,A,B,78
+
+ UPDATE_HASH (HASH_PTR), A
+ UPDATE_HASH 4(HASH_PTR), B
+ UPDATE_HASH 8(HASH_PTR), C
+ UPDATE_HASH 12(HASH_PTR), D
+ UPDATE_HASH 16(HASH_PTR), E
+
+ RESTORE_RENAMED_REGS
+ cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end
+ jne 1b
+.endm
+
+.macro INIT_REGALLOC
+ .set A, REG_A
+ .set B, REG_B
+ .set C, REG_C
+ .set D, REG_D
+ .set E, REG_E
+ .set T1, REG_T1
+ .set T2, REG_T2
+.endm
+
+.macro RESTORE_RENAMED_REGS
+ # order is important (REG_C is where it should be)
+ mov B, REG_B
+ mov D, REG_D
+ mov A, REG_A
+ mov E, REG_E
+.endm
+
+.macro SWAP_REG_NAMES a, b
+ .set _T, \a
+ .set \a, \b
+ .set \b, _T
+.endm
+
+.macro F1 b, c, d
+ mov \c, T1
+ SWAP_REG_NAMES \c, T1
+ xor \d, T1
+ and \b, T1
+ xor \d, T1
+.endm
+
+.macro F2 b, c, d
+ mov \d, T1
+ SWAP_REG_NAMES \d, T1
+ xor \c, T1
+ xor \b, T1
+.endm
+
+.macro F3 b, c, d
+ mov \c, T1
+ SWAP_REG_NAMES \c, T1
+ mov \b, T2
+ or \b, T1
+ and \c, T2
+ and \d, T1
+ or T2, T1
+.endm
+
+.macro F4 b, c, d
+ F2 \b, \c, \d
+.endm
+
+.macro UPDATE_HASH hash, val
+ add \hash, \val
+ mov \val, \hash
+.endm
+
+/*
+ * RR does two rounds of SHA-1 back to back with W[] pre-calc
+ * t1 = F(b, c, d); e += w(i)
+ * e += t1; b <<= 30; d += w(i+1);
+ * t1 = F(a, b, c);
+ * d += t1; a <<= 5;
+ * e += a;
+ * t1 = e; a >>= 7;
+ * t1 <<= 5;
+ * d += t1;
+ */
+.macro RR F, a, b, c, d, e, round
+ add WK(\round), \e
+ \F \b, \c, \d # t1 = F(b, c, d);
+ W_PRECALC (\round + W_PRECALC_AHEAD)
+ rol $30, \b
+ add T1, \e
+ add WK(\round + 1), \d
+
+ \F \a, \b, \c
+ W_PRECALC (\round + W_PRECALC_AHEAD + 1)
+ rol $5, \a
+ add \a, \e
+ add T1, \d
+ ror $7, \a # (a <<r 5) >>r 7 => a <<r 30
+
+ mov \e, T1
+ SWAP_REG_NAMES \e, T1
+
+ rol $5, T1
+ add T1, \d
+
+ # write: \a, \b
+ # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
+.endm
+
+.macro W_PRECALC r
+ .set i, \r
+
+ .if (i < 20)
+ .set K_XMM, 0
+ .elseif (i < 40)
+ .set K_XMM, 16
+ .elseif (i < 60)
+ .set K_XMM, 32
+ .elseif (i < 80)
+ .set K_XMM, 48
+ .endif
+
+ .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
+ .set i, ((\r) % 80) # pre-compute for the next iteration
+ .if (i == 0)
+ W_PRECALC_RESET
+ .endif
+ W_PRECALC_00_15
+ .elseif (i<32)
+ W_PRECALC_16_31
+ .elseif (i < 80) // rounds 32-79
+ W_PRECALC_32_79
+ .endif
+.endm
+
+.macro W_PRECALC_RESET
+ .set W, W0
+ .set W_minus_04, W4
+ .set W_minus_08, W8
+ .set W_minus_12, W12
+ .set W_minus_16, W16
+ .set W_minus_20, W20
+ .set W_minus_24, W24
+ .set W_minus_28, W28
+ .set W_minus_32, W
+.endm
+
+.macro W_PRECALC_ROTATE
+ .set W_minus_32, W_minus_28
+ .set W_minus_28, W_minus_24
+ .set W_minus_24, W_minus_20
+ .set W_minus_20, W_minus_16
+ .set W_minus_16, W_minus_12
+ .set W_minus_12, W_minus_08
+ .set W_minus_08, W_minus_04
+ .set W_minus_04, W
+ .set W, W_minus_32
+.endm
+
+.macro W_PRECALC_SSSE3
+
+.macro W_PRECALC_00_15
+ W_PRECALC_00_15_SSSE3
+.endm
+.macro W_PRECALC_16_31
+ W_PRECALC_16_31_SSSE3
+.endm
+.macro W_PRECALC_32_79
+ W_PRECALC_32_79_SSSE3
+.endm
+
+/* message scheduling pre-compute for rounds 0-15 */
+.macro W_PRECALC_00_15_SSSE3
+ .if ((i & 3) == 0)
+ movdqu (i*4)(BUFFER_PTR), W_TMP1
+ .elseif ((i & 3) == 1)
+ pshufb XMM_SHUFB_BSWAP, W_TMP1
+ movdqa W_TMP1, W
+ .elseif ((i & 3) == 2)
+ paddd (K_BASE), W_TMP1
+ .elseif ((i & 3) == 3)
+ movdqa W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+/* message scheduling pre-compute for rounds 16-31
+ *
+ * - calculate the w[i] values four at a time, keeping the last 32 of them
+ *   in 8 XMM registers
+ * - pre-calculate the K+w[i] values and store them to memory, to be loaded
+ *   later by the scalar ALU add instructions
+ *
+ * rounds 16-31 need some "heavy-lifting" vectorization because of the
+ * w[i] -> w[i-3] dependency, but rounds 32-79 improve on this (see below)
+ */
+.macro W_PRECALC_16_31_SSSE3
+ # blended scheduling of vector and scalar instruction streams, one 4-wide
+ # vector iteration / 4 scalar rounds
+ .if ((i & 3) == 0)
+ movdqa W_minus_12, W
+ palignr $8, W_minus_16, W # w[i-14]
+ movdqa W_minus_04, W_TMP1
+ psrldq $4, W_TMP1 # w[i-3]
+ pxor W_minus_08, W
+ .elseif ((i & 3) == 1)
+ pxor W_minus_16, W_TMP1
+ pxor W_TMP1, W
+ movdqa W, W_TMP2
+ movdqa W, W_TMP1
+ pslldq $12, W_TMP2
+ .elseif ((i & 3) == 2)
+ psrld $31, W
+ pslld $1, W_TMP1
+ por W, W_TMP1
+ movdqa W_TMP2, W
+ psrld $30, W_TMP2
+ pslld $2, W
+ .elseif ((i & 3) == 3)
+ pxor W, W_TMP1
+ pxor W_TMP2, W_TMP1
+ movdqa W_TMP1, W
+ paddd K_XMM(K_BASE), W_TMP1
+ movdqa W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+/* message scheduling pre-compute for rounds 32-79
+ *
+ * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
+ * instead we compute the equivalent: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+ * which vectorizes more efficiently, since the w[i] => w[i-3] dependency is broken
+ */
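+/*
+ * (The equivalence follows from applying the w[] recurrence to itself a
+ * second time: the duplicated intermediate terms cancel under xor.  It is
+ * valid only for i >= 32, hence only for rounds 32-79.)
+ */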
+.macro W_PRECALC_32_79_SSSE3
+ .if ((i & 3) == 0)
+ movdqa W_minus_04, W_TMP1
+ pxor W_minus_28, W # W is W_minus_32 before xor
+ palignr $8, W_minus_08, W_TMP1
+ .elseif ((i & 3) == 1)
+ pxor W_minus_16, W
+ pxor W_TMP1, W
+ movdqa W, W_TMP1
+ .elseif ((i & 3) == 2)
+ psrld $30, W
+ pslld $2, W_TMP1
+ por W, W_TMP1
+ .elseif ((i & 3) == 3)
+ movdqa W_TMP1, W
+ paddd K_XMM(K_BASE), W_TMP1
+ movdqa W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+.endm // W_PRECALC_SSSE3
+
+
+#define K1 0x5a827999
+#define K2 0x6ed9eba1
+#define K3 0x8f1bbcdc
+#define K4 0xca62c1d6
+
+.section .rodata
+.align 16
+
+K_XMM_AR:
+ .long K1, K1, K1, K1
+ .long K2, K2, K2, K2
+ .long K3, K3, K3, K3
+ .long K4, K4, K4, K4
+
+BSWAP_SHUFB_CTL:
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x08090a0b
+ .long 0x0c0d0e0f
+
+
+.section .text
+
+W_PRECALC_SSSE3
+.macro xmm_mov a, b
+ movdqu \a,\b
+.endm
+
+/*
+ * SSSE3 optimized implementation:
+ *
+ * void sha1_transform_ssse3(struct sha1_block_state *state,
+ * const u8 *data, size_t nblocks);
+ */
+SHA1_VECTOR_ASM sha1_transform_ssse3
+
+.macro W_PRECALC_AVX
+
+.purgem W_PRECALC_00_15
+.macro W_PRECALC_00_15
+ W_PRECALC_00_15_AVX
+.endm
+.purgem W_PRECALC_16_31
+.macro W_PRECALC_16_31
+ W_PRECALC_16_31_AVX
+.endm
+.purgem W_PRECALC_32_79
+.macro W_PRECALC_32_79
+ W_PRECALC_32_79_AVX
+.endm
+
+.macro W_PRECALC_00_15_AVX
+ .if ((i & 3) == 0)
+ vmovdqu (i*4)(BUFFER_PTR), W_TMP1
+ .elseif ((i & 3) == 1)
+ vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
+ .elseif ((i & 3) == 2)
+ vpaddd (K_BASE), W, W_TMP1
+ .elseif ((i & 3) == 3)
+ vmovdqa W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+.macro W_PRECALC_16_31_AVX
+ .if ((i & 3) == 0)
+ vpalignr $8, W_minus_16, W_minus_12, W # w[i-14]
+ vpsrldq $4, W_minus_04, W_TMP1 # w[i-3]
+ vpxor W_minus_08, W, W
+ vpxor W_minus_16, W_TMP1, W_TMP1
+ .elseif ((i & 3) == 1)
+ vpxor W_TMP1, W, W
+ vpslldq $12, W, W_TMP2
+ vpslld $1, W, W_TMP1
+ .elseif ((i & 3) == 2)
+ vpsrld $31, W, W
+ vpor W, W_TMP1, W_TMP1
+ vpslld $2, W_TMP2, W
+ vpsrld $30, W_TMP2, W_TMP2
+ .elseif ((i & 3) == 3)
+ vpxor W, W_TMP1, W_TMP1
+ vpxor W_TMP2, W_TMP1, W
+ vpaddd K_XMM(K_BASE), W, W_TMP1
+ vmovdqu W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+.macro W_PRECALC_32_79_AVX
+ .if ((i & 3) == 0)
+ vpalignr $8, W_minus_08, W_minus_04, W_TMP1
+ vpxor W_minus_28, W, W # W is W_minus_32 before xor
+ .elseif ((i & 3) == 1)
+ vpxor W_minus_16, W_TMP1, W_TMP1
+ vpxor W_TMP1, W, W
+ .elseif ((i & 3) == 2)
+ vpslld $2, W, W_TMP1
+ vpsrld $30, W, W
+ vpor W, W_TMP1, W
+ .elseif ((i & 3) == 3)
+ vpaddd K_XMM(K_BASE), W, W_TMP1
+ vmovdqu W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+.endm // W_PRECALC_AVX
+
+W_PRECALC_AVX
+.purgem xmm_mov
+.macro xmm_mov a, b
+ vmovdqu \a,\b
+.endm
+
+
+/* AVX optimized implementation:
+ * void sha1_transform_avx(struct sha1_block_state *state,
+ * const u8 *data, size_t nblocks);
+ */
+SHA1_VECTOR_ASM sha1_transform_avx
diff --git a/lib/crypto/x86/sha1.h b/lib/crypto/x86/sha1.h
new file mode 100644
index 000000000000..c48a0131fd12
--- /dev/null
+++ b/lib/crypto/x86/sha1.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-1 optimized for x86_64
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <linux/static_call.h>
+
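+/*
+ * Dispatched via a static call; the default target is the generic
+ * implementation until sha1_mod_init_arch() below updates it.
+ */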
+DEFINE_STATIC_CALL(sha1_blocks_x86, sha1_blocks_generic);
+
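+/*
+ * Wrap an assembly transform in a C function that runs it inside a
+ * kernel_fpu_begin()/kernel_fpu_end() section when the FPU is usable in
+ * the current context, and falls back to the generic code otherwise.
+ */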
+#define DEFINE_X86_SHA1_FN(c_fn, asm_fn) \
+ asmlinkage void asm_fn(struct sha1_block_state *state, \
+ const u8 *data, size_t nblocks); \
+ static void c_fn(struct sha1_block_state *state, \
+ const u8 *data, size_t nblocks) \
+ { \
+ if (likely(irq_fpu_usable())) { \
+ kernel_fpu_begin(); \
+ asm_fn(state, data, nblocks); \
+ kernel_fpu_end(); \
+ } else { \
+ sha1_blocks_generic(state, data, nblocks); \
+ } \
+ }
+
+DEFINE_X86_SHA1_FN(sha1_blocks_ssse3, sha1_transform_ssse3);
+DEFINE_X86_SHA1_FN(sha1_blocks_avx, sha1_transform_avx);
+DEFINE_X86_SHA1_FN(sha1_blocks_ni, sha1_ni_transform);
+
+#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* minimum number of 64-byte blocks for which AVX2 is worthwhile */
+
+asmlinkage void sha1_transform_avx2(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks);
+static void sha1_blocks_avx2(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (likely(irq_fpu_usable())) {
+ kernel_fpu_begin();
+ /* Select the optimal transform based on the number of blocks */
+ if (nblocks >= SHA1_AVX2_BLOCK_OPTSIZE)
+ sha1_transform_avx2(state, data, nblocks);
+ else
+ sha1_transform_avx(state, data, nblocks);
+ kernel_fpu_end();
+ } else {
+ sha1_blocks_generic(state, data, nblocks);
+ }
+}
+
+static void sha1_blocks(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ static_call(sha1_blocks_x86)(state, data, nblocks);
+}
+
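+/*
+ * Select the best implementation in decreasing order of preference:
+ * SHA-NI, then AVX2 (with BMI1/BMI2), then AVX, then SSSE3.  The AVX
+ * paths additionally require OS support for the SSE and YMM states.
+ */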
+#define sha1_mod_init_arch sha1_mod_init_arch
+static void sha1_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
+ static_call_update(sha1_blocks_x86, sha1_blocks_ni);
+ } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ NULL) &&
+ boot_cpu_has(X86_FEATURE_AVX)) {
+ if (boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_BMI1) &&
+ boot_cpu_has(X86_FEATURE_BMI2))
+ static_call_update(sha1_blocks_x86, sha1_blocks_avx2);
+ else
+ static_call_update(sha1_blocks_x86, sha1_blocks_avx);
+ } else if (boot_cpu_has(X86_FEATURE_SSSE3)) {
+ static_call_update(sha1_blocks_x86, sha1_blocks_ssse3);
+ }
+}
diff --git a/lib/crypto/x86/sha256-avx-asm.S b/lib/crypto/x86/sha256-avx-asm.S
new file mode 100644
index 000000000000..c1aceb3ba3a3
--- /dev/null
+++ b/lib/crypto/x86/sha256-avx-asm.S
@@ -0,0 +1,493 @@
+########################################################################
+# Implement fast SHA-256 with AVX1 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 1 block at a time, with 4 lanes per block
+########################################################################
+
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define VMOVDQ vmovdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+ add \p1, \p2
+ mov \p2, \p1
+.endm
+
+
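+# MY_ROR p1, p2
+# Rotate p2 right by p1 bits: shld with the same register as source and
+# destination rotates left, and a left-rotate by (32 - p1) is the same as
+# a right-rotate by p1.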
+.macro MY_ROR p1 p2
+ shld $(32-(\p1)), \p2, \p2
+.endm
+
+################################
+
+# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+# Load xmm with mem and byte swap each dword
+.macro COPY_XMM_AND_BSWAP p1 p2 p3
+ VMOVDQ \p2, \p1
+ vpshufb \p3, \p1, \p1
+.endm
+
+################################
+
+X0 = %xmm4
+X1 = %xmm5
+X2 = %xmm6
+X3 = %xmm7
+
+XTMP0 = %xmm0
+XTMP1 = %xmm1
+XTMP2 = %xmm2
+XTMP3 = %xmm3
+XTMP4 = %xmm8
+XFER = %xmm9
+XTMP5 = %xmm11
+
+SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
+SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %xmm13
+
+NUM_BLKS = %rdx # 3rd arg
+INP = %rsi # 2nd arg
+CTX = %rdi # 1st arg
+
+SRND = %rsi # clobbers INP
+c = %ecx
+d = %r8d
+e = %edx
+TBL = %r12
+a = %eax
+b = %ebx
+
+f = %r9d
+g = %r10d
+h = %r11d
+
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_XFER_SIZE = 16
+_XMM_SAVE_SIZE = 0
+
+_INP_END = 0
+_INP = _INP_END + _INP_END_SIZE
+_XFER = _INP + _INP_SIZE
+_XMM_SAVE = _XFER + _XFER_SIZE
+STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
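+
+# resulting layout, relative to the aligned %rsp:
+#   [ 0,  8)  _INP_END: pointer to the end of the input data
+#   [ 8, 16)  _INP:     saved input pointer
+#   [16, 32)  _XFER:    current 16 bytes of K + W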
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+X_ = X0
+X0 = X1
+X1 = X2
+X2 = X3
+X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+ ## compute s0 four at a time and s1 two at a time
+ ## compute W[-16] + W[-7] 4 at a time
+
+ mov e, y0 # y0 = e
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor a, y1 # y1 = a ^ (a >> (22-13))
+ xor g, y2 # y2 = f^g
+ vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ ## compute s0
+ vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y0, y2 # y2 = S1 + CH
+ add _XFER(%rsp), y2 # y2 = k + w + S1 + CH
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ vpsrld $7, XTMP1, XTMP2
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ vpslld $(32-7), XTMP1, XTMP3
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] MY_ROR 7
+ or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+ mov e, y0 # y0 = e
+ mov a, y1 # y1 = a
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ vpsrld $18, XTMP1, XTMP2 #
+ xor a, y1 # y1 = a ^ (a >> (22-13))
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor g, y2 # y2 = f^g
+ vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ vpslld $(32-18), XTMP1, XTMP1
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ vpxor XTMP1, XTMP3, XTMP3 #
+ add y0, y2 # y2 = S1 + CH
+ add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ ## compute low s1
+ vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
+ or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+ mov e, y0 # y0 = e
+ mov a, y1 # y1 = a
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ mov f, y2 # y2 = f
+ xor a, y1 # y1 = a ^ (a >> (22-13))
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
+ xor g, y2 # y2 = f^g
+ vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA}
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA}
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ vpxor XTMP3, XTMP2, XTMP2 #
+ add y0, y2 # y2 = S1 + CH
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ ## compute high s1
+ vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
+ or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+ mov e, y0 # y0 = e
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
+ xor a, y1 # y1 = a ^ (a >> (22-13))
+ xor g, y2 # y2 = f^g
+ vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC}
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC}
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ vpxor XTMP3, XTMP2, XTMP2
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y0, y2 # y2 = S1 + CH
+ add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+ rotate_Xs
+.endm
+
+## input is [rsp + _XFER + \round * 4]
+.macro DO_ROUND round
+ mov e, y0 # y0 = e
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ mov f, y2 # y2 = f
+ xor a, y1 # y1 = a ^ (a >> (22-13))
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor g, y2 # y2 = f^g
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ and e, y2 # y2 = (f^g)&e
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ MY_ROR 6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ add y0, y2 # y2 = S1 + CH
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ offset = \round * 4 + _XFER #
+ add offset(%rsp), y2 # y2 = k + w + S1 + CH
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+.endm
+
+########################################################################
+## void sha256_transform_avx(struct sha256_block_state *state,
+## const u8 *data, size_t nblocks);
+########################################################################
+.text
+SYM_FUNC_START(sha256_transform_avx)
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbp
+ movq %rsp, %rbp
+
+ subq $STACK_SIZE, %rsp # allocate stack space
+ and $~15, %rsp # align stack pointer
+
+ shl $6, NUM_BLKS # convert to bytes
+ add INP, NUM_BLKS # pointer to end of data
+ mov NUM_BLKS, _INP_END(%rsp)
+
+ ## load initial digest
+ mov 4*0(CTX), a
+ mov 4*1(CTX), b
+ mov 4*2(CTX), c
+ mov 4*3(CTX), d
+ mov 4*4(CTX), e
+ mov 4*5(CTX), f
+ mov 4*6(CTX), g
+ mov 4*7(CTX), h
+
+ vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+ vmovdqa _SHUF_00BA(%rip), SHUF_00BA
+ vmovdqa _SHUF_DC00(%rip), SHUF_DC00
+.Lloop0:
+ lea K256(%rip), TBL
+
+ ## byte swap first 16 dwords
+ COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
+
+ mov INP, _INP(%rsp)
+
+ ## schedule 48 input dwords, by doing 3 rounds of 16 each
+ mov $3, SRND
+.align 16
+.Lloop1:
+ vpaddd (TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd 1*16(TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd 2*16(TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd 3*16(TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ add $4*16, TBL
+ FOUR_ROUNDS_AND_SCHED
+
+ sub $1, SRND
+ jne .Lloop1
+
+ mov $2, SRND
+.Lloop2:
+ vpaddd (TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ vpaddd 1*16(TBL), X1, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ add $2*16, TBL
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ vmovdqa X2, X0
+ vmovdqa X3, X1
+
+ sub $1, SRND
+ jne .Lloop2
+
+ addm (4*0)(CTX),a
+ addm (4*1)(CTX),b
+ addm (4*2)(CTX),c
+ addm (4*3)(CTX),d
+ addm (4*4)(CTX),e
+ addm (4*5)(CTX),f
+ addm (4*6)(CTX),g
+ addm (4*7)(CTX),h
+
+ mov _INP(%rsp), INP
+ add $64, INP
+ cmp _INP_END(%rsp), INP
+ jne .Lloop0
+
+ mov %rbp, %rsp
+ popq %rbp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ RET
+SYM_FUNC_END(sha256_transform_avx)
+
+.section .rodata.cst256.K256, "aM", @progbits, 256
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203
+
+.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
+.align 16
+# shuffle xBxA -> 00BA
+_SHUF_00BA:
+ .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
+.align 16
+# shuffle xDxC -> DC00
+_SHUF_DC00:
+ .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/lib/crypto/x86/sha256-avx2-asm.S b/lib/crypto/x86/sha256-avx2-asm.S
new file mode 100644
index 000000000000..eb8836fb9695
--- /dev/null
+++ b/lib/crypto/x86/sha256-avx2-asm.S
@@ -0,0 +1,770 @@
+########################################################################
+# Implement fast SHA-256 with AVX2 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 2 blocks at a time, with 4 lanes per block
+########################################################################
+
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define VMOVDQ vmovdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+ add \p1, \p2
+ mov \p2, \p1
+.endm
+
+################################
+
+X0 = %ymm4
+X1 = %ymm5
+X2 = %ymm6
+X3 = %ymm7
+
+# XMM versions of above
+XWORD0 = %xmm4
+XWORD1 = %xmm5
+XWORD2 = %xmm6
+XWORD3 = %xmm7
+
+XTMP0 = %ymm0
+XTMP1 = %ymm1
+XTMP2 = %ymm2
+XTMP3 = %ymm3
+XTMP4 = %ymm8
+XFER = %ymm9
+XTMP5 = %ymm11
+
+SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
+SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %ymm13
+
+X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
+
+NUM_BLKS = %rdx # 3rd arg
+INP = %rsi # 2nd arg
+CTX = %rdi # 1st arg
+c = %ecx
+d = %r8d
+e = %edx # clobbers NUM_BLKS
+y3 = %esi # clobbers INP
+
+SRND = CTX # SRND is same register as CTX
+
+a = %eax
+b = %ebx
+f = %r9d
+g = %r10d
+h = %r11d
+old_h = %r11d
+
+T1 = %r12d
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
+_XMM_SAVE_SIZE = 0
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_CTX_SIZE = 8
+
+_XFER = 0
+_XMM_SAVE = _XFER + _XFER_SIZE
+_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
+_INP = _INP_END + _INP_END_SIZE
+_CTX = _INP + _INP_SIZE
+STACK_SIZE = _CTX + _CTX_SIZE
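+
+# _XFER holds the K + W values for all 64 rounds of both interleaved
+# blocks (2 * 64 * 4 = 512 bytes), which lets .Lloop3 process the second
+# block by reusing the values scheduled while the first was processed.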
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+ X_ = X0
+ X0 = X1
+ X1 = X2
+ X2 = X3
+ X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+ old_h = h
+ TMP_ = h
+ h = g
+ g = f
+ f = e
+ e = d
+ d = c
+ c = b
+ b = a
+ a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED disp
+################################### RND N + 0 ############################
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+
+ addl \disp(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+ vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
+ mov f, y2 # y2 = f # CH
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ xor g, y2 # y2 = f^g # CH
+ vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+
+ and e, y2 # y2 = (f^g)&e # CH
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ add h, d # d = k + w + h + d # --
+
+ and b, y3 # y3 = (a|c)&b # MAJA
+ vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ vpsrld $7, XTMP1, XTMP2
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+
+ add y0, y2 # y2 = S1 + CH # --
+ vpslld $(32-7), XTMP1, XTMP3
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+ vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
+
+ vpsrld $18, XTMP1, XTMP2
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+
+ ROTATE_ARGS
+
+################################### RND N + 1 ############################
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ offset = \disp + 1*4
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+
+ vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
+ mov f, y2 # y2 = f # CH
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ xor g, y2 # y2 = f^g # CH
+
+
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ and e, y2 # y2 = (f^g)&e # CH
+ add h, d # d = k + w + h + d # --
+
+ vpslld $(32-18), XTMP1, XTMP1
+ and b, y3 # y3 = (a|c)&b # MAJA
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+
+ vpxor XTMP1, XTMP3, XTMP3
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
+ vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
+
+
+ ROTATE_ARGS
+
+################################### RND N + 2 ############################
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ offset = \disp + 2*4
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+
+ vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ or c, y3 # y3 = a|c # MAJA
+ mov f, y2 # y2 = f # CH
+ xor g, y2 # y2 = f^g # CH
+
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
+ and e, y2 # y2 = (f^g)&e # CH
+
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ vpxor XTMP3, XTMP2, XTMP2
+ add h, d # d = k + w + h + d # --
+ and b, y3 # y3 = (a|c)&b # MAJA
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a ,T1 # T1 = (a >> 2) # S0
+ vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+ vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
+
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1,h # h = k + w + h + S0 # --
+ add y2,d # d = k + w + h + d + S1 + CH = d + t1 # --
+ add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+ add y3,h # h = t1 + S0 + MAJ # --
+
+
+ ROTATE_ARGS
+
+################################### RND N + 3 ############################
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ offset = \disp + 3*4
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+
+ vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
+ mov f, y2 # y2 = f # CH
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ xor g, y2 # y2 = f^g # CH
+
+
+ vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add h, d # d = k + w + h + d # --
+ and b, y3 # y3 = (a|c)&b # MAJA
+
+ vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ vpxor XTMP3, XTMP2, XTMP2
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ add y0, y2 # y2 = S1 + CH # --
+
+ vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
+
+ vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+
+ add y1, h # h = k + w + h + S0 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ ROTATE_ARGS
+ rotate_Xs
+.endm
+
+.macro DO_4ROUNDS disp
+################################### RND N + 0 ###########################
+
+ mov f, y2 # y2 = f # CH
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ addl \disp(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ ROTATE_ARGS
+
+################################### RND N + 1 ###########################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ offset = 4*1 + \disp
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ ROTATE_ARGS
+
+################################### RND N + 2 ##############################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ offset = 4*2 + \disp
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ ROTATE_ARGS
+
+################################### RND N + 3 ###########################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ offset = 4*3 + \disp
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ ROTATE_ARGS
+
+.endm
+
+########################################################################
+## void sha256_transform_rorx(struct sha256_block_state *state,
+## const u8 *data, size_t nblocks);
+########################################################################
+.text
+SYM_FUNC_START(sha256_transform_rorx)
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ push %rbp
+ mov %rsp, %rbp
+
+ subq $STACK_SIZE, %rsp
+ and $-32, %rsp # align rsp to 32 byte boundary
+
+ shl $6, NUM_BLKS # convert to bytes
+ lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
+ mov NUM_BLKS, _INP_END(%rsp)
+
+ cmp NUM_BLKS, INP
+ je .Lonly_one_block
+
+ ## load initial digest
+ mov (CTX), a
+ mov 4*1(CTX), b
+ mov 4*2(CTX), c
+ mov 4*3(CTX), d
+ mov 4*4(CTX), e
+ mov 4*5(CTX), f
+ mov 4*6(CTX), g
+ mov 4*7(CTX), h
+
+ vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+ vmovdqa _SHUF_00BA(%rip), SHUF_00BA
+ vmovdqa _SHUF_DC00(%rip), SHUF_DC00
+
+ mov CTX, _CTX(%rsp)
+
+.Lloop0:
+ ## Load first 16 dwords from two blocks
+ VMOVDQ 0*32(INP),XTMP0
+ VMOVDQ 1*32(INP),XTMP1
+ VMOVDQ 2*32(INP),XTMP2
+ VMOVDQ 3*32(INP),XTMP3
+
+ ## byte swap data
+ vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
+ vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
+ vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
+ vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3
+
+ ## transpose data into high/low halves
+ vperm2i128 $0x20, XTMP2, XTMP0, X0
+ vperm2i128 $0x31, XTMP2, XTMP0, X1
+ vperm2i128 $0x20, XTMP3, XTMP1, X2
+ vperm2i128 $0x31, XTMP3, XTMP1, X3
+
+.Llast_block_enter:
+ add $64, INP
+ mov INP, _INP(%rsp)
+
+ ## schedule 48 input dwords, by doing 3 iterations of 16 each
+ xor SRND, SRND
+
+.align 16
+.Lloop1:
+ leaq K256+0*32(%rip), INP ## reuse INP as scratch reg
+ vpaddd (INP, SRND), X0, XFER
+ vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
+ FOUR_ROUNDS_AND_SCHED (_XFER + 0*32)
+
+ leaq K256+1*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
+ vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
+ FOUR_ROUNDS_AND_SCHED (_XFER + 1*32)
+
+ leaq K256+2*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
+ vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
+ FOUR_ROUNDS_AND_SCHED (_XFER + 2*32)
+
+ leaq K256+3*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
+ vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
+ FOUR_ROUNDS_AND_SCHED (_XFER + 3*32)
+
+ add $4*32, SRND
+ cmp $3*4*32, SRND
+ jb .Lloop1
+
+.Lloop2:
+ ## Do last 16 rounds with no scheduling
+ leaq K256+0*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
+ vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
+ DO_4ROUNDS (_XFER + 0*32)
+
+ leaq K256+1*32(%rip), INP
+ vpaddd (INP, SRND), X1, XFER
+ vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
+ DO_4ROUNDS (_XFER + 1*32)
+ add $2*32, SRND
+
+ vmovdqa X2, X0
+ vmovdqa X3, X1
+
+ cmp $4*4*32, SRND
+ jb .Lloop2
+
+ mov _CTX(%rsp), CTX
+ mov _INP(%rsp), INP
+
+ addm (4*0)(CTX),a
+ addm (4*1)(CTX),b
+ addm (4*2)(CTX),c
+ addm (4*3)(CTX),d
+ addm (4*4)(CTX),e
+ addm (4*5)(CTX),f
+ addm (4*6)(CTX),g
+ addm (4*7)(CTX),h
+
+ cmp _INP_END(%rsp), INP
+ ja .Ldone_hash
+
+ #### Do second block using previously scheduled results
+ xor SRND, SRND
+.align 16
+.Lloop3:
+ DO_4ROUNDS (_XFER + 0*32 + 16)
+ DO_4ROUNDS (_XFER + 1*32 + 16)
+ add $2*32, SRND
+ cmp $4*4*32, SRND
+ jb .Lloop3
+
+ mov _CTX(%rsp), CTX
+ mov _INP(%rsp), INP
+ add $64, INP
+
+ addm (4*0)(CTX),a
+ addm (4*1)(CTX),b
+ addm (4*2)(CTX),c
+ addm (4*3)(CTX),d
+ addm (4*4)(CTX),e
+ addm (4*5)(CTX),f
+ addm (4*6)(CTX),g
+ addm (4*7)(CTX),h
+
+ cmp _INP_END(%rsp), INP
+ jb .Lloop0
+ ja .Ldone_hash
+
+.Ldo_last_block:
+ VMOVDQ 0*16(INP),XWORD0
+ VMOVDQ 1*16(INP),XWORD1
+ VMOVDQ 2*16(INP),XWORD2
+ VMOVDQ 3*16(INP),XWORD3
+
+ vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
+ vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
+ vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
+ vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3
+
+ jmp .Llast_block_enter
+
+.Lonly_one_block:
+
+ ## load initial digest
+ mov (4*0)(CTX),a
+ mov (4*1)(CTX),b
+ mov (4*2)(CTX),c
+ mov (4*3)(CTX),d
+ mov (4*4)(CTX),e
+ mov (4*5)(CTX),f
+ mov (4*6)(CTX),g
+ mov (4*7)(CTX),h
+
+ vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+ vmovdqa _SHUF_00BA(%rip), SHUF_00BA
+ vmovdqa _SHUF_DC00(%rip), SHUF_DC00
+
+ mov CTX, _CTX(%rsp)
+ jmp .Ldo_last_block
+
+.Ldone_hash:
+
+ mov %rbp, %rsp
+ pop %rbp
+
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ vzeroupper
+ RET
+SYM_FUNC_END(sha256_transform_rorx)
+
+.section .rodata.cst512.K256, "aM", @progbits, 512
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
+
+# shuffle xBxA -> 00BA
+.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
+.align 32
+_SHUF_00BA:
+ .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+# shuffle xDxC -> DC00
+.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
+.align 32
+_SHUF_DC00:
+ .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/lib/crypto/x86/sha256-ni-asm.S b/lib/crypto/x86/sha256-ni-asm.S
new file mode 100644
index 000000000000..de5f707e7ef7
--- /dev/null
+++ b/lib/crypto/x86/sha256-ni-asm.S
@@ -0,0 +1,559 @@
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-256 update function
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define STATE_PTR %rdi /* 1st arg */
+#define DATA_PTR %rsi /* 2nd arg */
+#define NUM_BLKS %rdx /* 3rd arg */
+
+#define SHA256CONSTANTS %rax
+
+#define MSG %xmm0 /* sha256rnds2 implicit operand */
+#define STATE0 %xmm1
+#define STATE1 %xmm2
+#define MSG0 %xmm3
+#define MSG1 %xmm4
+#define MSG2 %xmm5
+#define MSG3 %xmm6
+#define TMP %xmm7
+
+#define SHUF_MASK %xmm8
+
+#define ABEF_SAVE %xmm9
+#define CDGH_SAVE %xmm10
+
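+/*
+ * Do 4 rounds of SHA-256 using the message words in \m0 (each sha256rnds2
+ * does 2 rounds).  For the first 16 rounds the words are loaded and
+ * byte-swapped from DATA_PTR; within the scheduling window, sha256msg1 and
+ * sha256msg2 also compute 4 future message words alongside the rounds.
+ */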
+.macro do_4rounds i, m0, m1, m2, m3
+.if \i < 16
+ movdqu \i*4(DATA_PTR), \m0
+ pshufb SHUF_MASK, \m0
+.endif
+ movdqa (\i-32)*4(SHA256CONSTANTS), MSG
+ paddd \m0, MSG
+ sha256rnds2 STATE0, STATE1
+.if \i >= 12 && \i < 60
+ movdqa \m0, TMP
+ palignr $4, \m3, TMP
+ paddd TMP, \m1
+ sha256msg2 \m0, \m1
+.endif
+ punpckhqdq MSG, MSG
+ sha256rnds2 STATE1, STATE0
+.if \i >= 4 && \i < 52
+ sha256msg1 \m0, \m3
+.endif
+.endm
+
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-256 block function
+ *
+ * This function takes a pointer to the current SHA-256 state, a pointer to the
+ * input data, and the number of 64-byte blocks to process. Once all blocks
+ * have been processed, the state is updated with the new state. This function
+ * only processes complete blocks. State initialization, buffering of partial
+ * blocks, and digest finalization are expected to be handled elsewhere.
+ *
+ * void sha256_ni_transform(struct sha256_block_state *state,
+ * const u8 *data, size_t nblocks);
+ */
+.text
+SYM_FUNC_START(sha256_ni_transform)
+
+ shl $6, NUM_BLKS /* convert to bytes */
+ add DATA_PTR, NUM_BLKS /* pointer to end of data */
+
+ /*
+ * load initial hash values
+ * Need to reorder these appropriately
+ * DCBA, HGFE -> ABEF, CDGH
+ */
+ movdqu 0*16(STATE_PTR), STATE0 /* DCBA */
+ movdqu 1*16(STATE_PTR), STATE1 /* HGFE */
+
+ movdqa STATE0, TMP
+ punpcklqdq STATE1, STATE0 /* FEBA */
+ punpckhqdq TMP, STATE1 /* DCHG */
+ pshufd $0x1B, STATE0, STATE0 /* ABEF */
+ pshufd $0xB1, STATE1, STATE1 /* CDGH */
+
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
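+ /*
+  * Bias the constant pointer by 32*4 so that the (\i-32)*4 displacements
+  * used in do_4rounds stay within -128..112 and fit in a signed byte.
+  */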
+ lea K256+32*4(%rip), SHA256CONSTANTS
+
+.Lloop0:
+ /* Save hash values for addition after rounds */
+ movdqa STATE0, ABEF_SAVE
+ movdqa STATE1, CDGH_SAVE
+
+.irp i, 0, 16, 32, 48
+ do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3
+ do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0
+ do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1
+ do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2
+.endr
+
+ /* Add current hash values with previously saved */
+ paddd ABEF_SAVE, STATE0
+ paddd CDGH_SAVE, STATE1
+
+ /* Increment data pointer and loop if more to process */
+ add $64, DATA_PTR
+ cmp NUM_BLKS, DATA_PTR
+ jne .Lloop0
+
+ /* Write hash values back in the correct order */
+ movdqa STATE0, TMP
+ punpcklqdq STATE1, STATE0 /* GHEF */
+ punpckhqdq TMP, STATE1 /* ABCD */
+ pshufd $0xB1, STATE0, STATE0 /* HGFE */
+ pshufd $0x1B, STATE1, STATE1 /* DCBA */
+
+ movdqu STATE1, 0*16(STATE_PTR)
+ movdqu STATE0, 1*16(STATE_PTR)
+
+ RET
+SYM_FUNC_END(sha256_ni_transform)
+
+#undef STATE_PTR
+#undef DATA_PTR
+#undef NUM_BLKS
+#undef SHA256CONSTANTS
+#undef MSG
+#undef STATE0
+#undef STATE1
+#undef MSG0
+#undef MSG1
+#undef MSG2
+#undef MSG3
+#undef TMP
+#undef SHUF_MASK
+#undef ABEF_SAVE
+#undef CDGH_SAVE
+
+// parameters for sha256_ni_finup2x()
+#define CTX %rdi
+#define DATA1 %rsi
+#define DATA2 %rdx
+#define LEN %ecx
+#define LEN8 %cl
+#define LEN64 %rcx
+#define OUT1 %r8
+#define OUT2 %r9
+
+// other scalar variables
+#define SHA256CONSTANTS %rax
+#define COUNT %r10
+#define COUNT32 %r10d
+#define FINAL_STEP %r11d
+
+// rbx is used as a temporary.
+
+#define MSG %xmm0 // sha256rnds2 implicit operand
+#define STATE0_A %xmm1
+#define STATE1_A %xmm2
+#define STATE0_B %xmm3
+#define STATE1_B %xmm4
+#define TMP_A %xmm5
+#define TMP_B %xmm6
+#define MSG0_A %xmm7
+#define MSG1_A %xmm8
+#define MSG2_A %xmm9
+#define MSG3_A %xmm10
+#define MSG0_B %xmm11
+#define MSG1_B %xmm12
+#define MSG2_B %xmm13
+#define MSG3_B %xmm14
+#define SHUF_MASK %xmm15
+
+#define OFFSETOF_STATE 0 // offsetof(struct __sha256_ctx, state)
+#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount)
+#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf)
+
+// Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a and m0_b
+// contain the current 4 message schedule words for the first and second message
+// respectively.
+//
+// If not all the message schedule words have been computed yet, then this also
+// computes 4 more message schedule words for each message. m1_a-m3_a contain
+// the next 3 groups of 4 message schedule words for the first message, and
+// likewise m1_b-m3_b for the second. After consuming the current value of
+// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
+// likewise for *_b. This means that the next (m0_a, m1_a, m2_a, m3_a) is the
+// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must
+// cycle through the registers accordingly.
+.macro do_4rounds_2x i, m0_a, m1_a, m2_a, m3_a, m0_b, m1_b, m2_b, m3_b
+ movdqa (\i-32)*4(SHA256CONSTANTS), TMP_A
+ movdqa TMP_A, TMP_B
+ paddd \m0_a, TMP_A
+ paddd \m0_b, TMP_B
+.if \i < 48
+ sha256msg1 \m1_a, \m0_a
+ sha256msg1 \m1_b, \m0_b
+.endif
+ movdqa TMP_A, MSG
+ sha256rnds2 STATE0_A, STATE1_A
+ movdqa TMP_B, MSG
+ sha256rnds2 STATE0_B, STATE1_B
+ pshufd $0x0E, TMP_A, MSG
+ sha256rnds2 STATE1_A, STATE0_A
+ pshufd $0x0E, TMP_B, MSG
+ sha256rnds2 STATE1_B, STATE0_B
+.if \i < 48
+ movdqa \m3_a, TMP_A
+ movdqa \m3_b, TMP_B
+ palignr $4, \m2_a, TMP_A
+ palignr $4, \m2_b, TMP_B
+ paddd TMP_A, \m0_a
+ paddd TMP_B, \m0_b
+ sha256msg2 \m3_a, \m0_a
+ sha256msg2 \m3_b, \m0_b
+.endif
+.endm
+
+//
+// void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
+// const u8 *data1, const u8 *data2, int len,
+// u8 out1[SHA256_DIGEST_SIZE],
+// u8 out2[SHA256_DIGEST_SIZE]);
+//
+// This function computes the SHA-256 digests of two messages |data1| and
+// |data2| that are both |len| bytes long, starting from the initial context
+// |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
+//
+// The instructions for the two SHA-256 operations are interleaved. On many
+// CPUs, this is almost twice as fast as hashing each message individually due
+// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
+//
+SYM_FUNC_START(sha256_ni_finup2x)
+ // Allocate 128 bytes of stack space, 16-byte aligned.
+ push %rbx
+ push %rbp
+ mov %rsp, %rbp
+ sub $128, %rsp
+ and $~15, %rsp
+
+ // Load the shuffle mask for swapping the endianness of 32-bit words.
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+
+ // Set up pointer to the round constants.
+ lea K256+32*4(%rip), SHA256CONSTANTS
+
+ // Initially we're not processing the final blocks.
+ xor FINAL_STEP, FINAL_STEP
+
+ // Load the initial state from ctx->state.
+ movdqu OFFSETOF_STATE+0*16(CTX), STATE0_A // DCBA
+ movdqu OFFSETOF_STATE+1*16(CTX), STATE1_A // HGFE
+ movdqa STATE0_A, TMP_A
+ punpcklqdq STATE1_A, STATE0_A // FEBA
+ punpckhqdq TMP_A, STATE1_A // DCHG
+ pshufd $0x1B, STATE0_A, STATE0_A // ABEF
+ pshufd $0xB1, STATE1_A, STATE1_A // CDGH
+
+ // Load ctx->bytecount. Take the mod 64 of it to get the number of
+ // bytes that are buffered in ctx->buf. Also save it in a register with
+ // LEN added to it.
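+ // (The 32-bit self-move below zero-extends LEN into the 64-bit LEN64.)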
+ mov LEN, LEN
+ mov OFFSETOF_BYTECOUNT(CTX), %rbx
+ lea (%rbx, LEN64, 1), COUNT
+ and $63, %ebx
+ jz .Lfinup2x_enter_loop // No bytes buffered?
+
+ // %ebx bytes (1 to 63) are currently buffered in ctx->buf. Load them
+ // followed by the first 64 - %ebx bytes of data. Since LEN >= 64, we
+ // just load 64 bytes from each of ctx->buf, DATA1, and DATA2
+ // unconditionally and rearrange the data as needed.
+
+ movdqu OFFSETOF_BUF+0*16(CTX), MSG0_A
+ movdqu OFFSETOF_BUF+1*16(CTX), MSG1_A
+ movdqu OFFSETOF_BUF+2*16(CTX), MSG2_A
+ movdqu OFFSETOF_BUF+3*16(CTX), MSG3_A
+ movdqa MSG0_A, 0*16(%rsp)
+ movdqa MSG1_A, 1*16(%rsp)
+ movdqa MSG2_A, 2*16(%rsp)
+ movdqa MSG3_A, 3*16(%rsp)
+
+ movdqu 0*16(DATA1), MSG0_A
+ movdqu 1*16(DATA1), MSG1_A
+ movdqu 2*16(DATA1), MSG2_A
+ movdqu 3*16(DATA1), MSG3_A
+ movdqu MSG0_A, 0*16(%rsp,%rbx)
+ movdqu MSG1_A, 1*16(%rsp,%rbx)
+ movdqu MSG2_A, 2*16(%rsp,%rbx)
+ movdqu MSG3_A, 3*16(%rsp,%rbx)
+ movdqa 0*16(%rsp), MSG0_A
+ movdqa 1*16(%rsp), MSG1_A
+ movdqa 2*16(%rsp), MSG2_A
+ movdqa 3*16(%rsp), MSG3_A
+
+ movdqu 0*16(DATA2), MSG0_B
+ movdqu 1*16(DATA2), MSG1_B
+ movdqu 2*16(DATA2), MSG2_B
+ movdqu 3*16(DATA2), MSG3_B
+ movdqu MSG0_B, 0*16(%rsp,%rbx)
+ movdqu MSG1_B, 1*16(%rsp,%rbx)
+ movdqu MSG2_B, 2*16(%rsp,%rbx)
+ movdqu MSG3_B, 3*16(%rsp,%rbx)
+ movdqa 0*16(%rsp), MSG0_B
+ movdqa 1*16(%rsp), MSG1_B
+ movdqa 2*16(%rsp), MSG2_B
+ movdqa 3*16(%rsp), MSG3_B
+
+ sub $64, %rbx // rbx = buffered - 64
+ sub %rbx, DATA1 // DATA1 += 64 - buffered
+ sub %rbx, DATA2 // DATA2 += 64 - buffered
+ add %ebx, LEN // LEN += buffered - 64
+ movdqa STATE0_A, STATE0_B
+ movdqa STATE1_A, STATE1_B
+ jmp .Lfinup2x_loop_have_data
+
+.Lfinup2x_enter_loop:
+ sub $64, LEN
+ movdqa STATE0_A, STATE0_B
+ movdqa STATE1_A, STATE1_B
+.Lfinup2x_loop:
+ // Load the next two data blocks.
+ movdqu 0*16(DATA1), MSG0_A
+ movdqu 0*16(DATA2), MSG0_B
+ movdqu 1*16(DATA1), MSG1_A
+ movdqu 1*16(DATA2), MSG1_B
+ movdqu 2*16(DATA1), MSG2_A
+ movdqu 2*16(DATA2), MSG2_B
+ movdqu 3*16(DATA1), MSG3_A
+ movdqu 3*16(DATA2), MSG3_B
+ add $64, DATA1
+ add $64, DATA2
+.Lfinup2x_loop_have_data:
+ // Convert the words of the data blocks from big endian.
+ pshufb SHUF_MASK, MSG0_A
+ pshufb SHUF_MASK, MSG0_B
+ pshufb SHUF_MASK, MSG1_A
+ pshufb SHUF_MASK, MSG1_B
+ pshufb SHUF_MASK, MSG2_A
+ pshufb SHUF_MASK, MSG2_B
+ pshufb SHUF_MASK, MSG3_A
+ pshufb SHUF_MASK, MSG3_B
+.Lfinup2x_loop_have_bswapped_data:
+
+ // Save the original state for each block.
+ movdqa STATE0_A, 0*16(%rsp)
+ movdqa STATE0_B, 1*16(%rsp)
+ movdqa STATE1_A, 2*16(%rsp)
+ movdqa STATE1_B, 3*16(%rsp)
+
+ // Do the SHA-256 rounds on each block.
+.irp i, 0, 16, 32, 48
+ do_4rounds_2x (\i + 0), MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
+ MSG0_B, MSG1_B, MSG2_B, MSG3_B
+ do_4rounds_2x (\i + 4), MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
+ MSG1_B, MSG2_B, MSG3_B, MSG0_B
+ do_4rounds_2x (\i + 8), MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
+ MSG2_B, MSG3_B, MSG0_B, MSG1_B
+ do_4rounds_2x (\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
+ MSG3_B, MSG0_B, MSG1_B, MSG2_B
+.endr
+
+ // Add the original state for each block.
+ paddd 0*16(%rsp), STATE0_A
+ paddd 1*16(%rsp), STATE0_B
+ paddd 2*16(%rsp), STATE1_A
+ paddd 3*16(%rsp), STATE1_B
+
+ // Update LEN and loop back if more blocks remain.
+ sub $64, LEN
+ jge .Lfinup2x_loop
+
+ // Check if any final blocks need to be handled.
+ // FINAL_STEP = 2: all done
+ // FINAL_STEP = 1: need to do count-only padding block
+ // FINAL_STEP = 0: need to do the block with 0x80 padding byte
+ cmp $1, FINAL_STEP
+ jg .Lfinup2x_done
+ je .Lfinup2x_finalize_countonly
+ add $64, LEN
+ jz .Lfinup2x_finalize_blockaligned
+
+ // Not block-aligned; 1 <= LEN <= 63 data bytes remain. Pad the block.
+ // To do this, write the padding starting with the 0x80 byte to
+ // &sp[64]. Then for each message, copy the last 64 data bytes to sp
+ // and load from &sp[64 - LEN] to get the needed padding block. This
+ // code relies on the data buffers being >= 64 bytes in length.
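+ //
+ // In effect, for each message (a C sketch; sp is the stack scratch area
+ // and LEN is the 1..63 remaining bytes):
+ //
+ //	memcpy(&sp[64], padding, 64);	// 0x80, zeroes, maybe the count
+ //	memcpy(sp, data + len - 64, 64);	// the last 64 data bytes
+ //	block = &sp[64 - LEN];	// LEN data bytes, then the padding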
+ mov $64, %ebx
+ sub LEN, %ebx // ebx = 64 - LEN
+ sub %rbx, DATA1 // DATA1 -= 64 - LEN
+ sub %rbx, DATA2 // DATA2 -= 64 - LEN
+ mov $0x80, FINAL_STEP // using FINAL_STEP as a temporary
+ movd FINAL_STEP, MSG0_A
+ pxor MSG1_A, MSG1_A
+ movdqa MSG0_A, 4*16(%rsp)
+ movdqa MSG1_A, 5*16(%rsp)
+ movdqa MSG1_A, 6*16(%rsp)
+ movdqa MSG1_A, 7*16(%rsp)
+ cmp $56, LEN
+ jge 1f // will COUNT spill into its own block?
+ shl $3, COUNT
+ bswap COUNT
+ mov COUNT, 56(%rsp,%rbx)
+ mov $2, FINAL_STEP // won't need count-only block
+ jmp 2f
+1:
+ mov $1, FINAL_STEP // will need count-only block
+2:
+ movdqu 0*16(DATA1), MSG0_A
+ movdqu 1*16(DATA1), MSG1_A
+ movdqu 2*16(DATA1), MSG2_A
+ movdqu 3*16(DATA1), MSG3_A
+ movdqa MSG0_A, 0*16(%rsp)
+ movdqa MSG1_A, 1*16(%rsp)
+ movdqa MSG2_A, 2*16(%rsp)
+ movdqa MSG3_A, 3*16(%rsp)
+ movdqu 0*16(%rsp,%rbx), MSG0_A
+ movdqu 1*16(%rsp,%rbx), MSG1_A
+ movdqu 2*16(%rsp,%rbx), MSG2_A
+ movdqu 3*16(%rsp,%rbx), MSG3_A
+
+ movdqu 0*16(DATA2), MSG0_B
+ movdqu 1*16(DATA2), MSG1_B
+ movdqu 2*16(DATA2), MSG2_B
+ movdqu 3*16(DATA2), MSG3_B
+ movdqa MSG0_B, 0*16(%rsp)
+ movdqa MSG1_B, 1*16(%rsp)
+ movdqa MSG2_B, 2*16(%rsp)
+ movdqa MSG3_B, 3*16(%rsp)
+ movdqu 0*16(%rsp,%rbx), MSG0_B
+ movdqu 1*16(%rsp,%rbx), MSG1_B
+ movdqu 2*16(%rsp,%rbx), MSG2_B
+ movdqu 3*16(%rsp,%rbx), MSG3_B
+ jmp .Lfinup2x_loop_have_data
+
+ // Prepare a padding block, either:
+ //
+ // {0x80, 0, 0, 0, ..., count (as __be64)}
+ // This is for a block-aligned message.
+ //
+ // { 0, 0, 0, 0, ..., count (as __be64)}
+ // This is for a message whose length mod 64 is >= 56; its 0x80 byte
+ // was already included in the preceding padded block.
+ //
+ // Pre-swap the endianness of the words.
+.Lfinup2x_finalize_countonly:
+ pxor MSG0_A, MSG0_A
+ jmp 1f
+
+.Lfinup2x_finalize_blockaligned:
+ mov $0x80000000, %ebx
+ movd %ebx, MSG0_A
+1:
+ pxor MSG1_A, MSG1_A
+ pxor MSG2_A, MSG2_A
+ ror $29, COUNT
+ movq COUNT, MSG3_A
+ pslldq $8, MSG3_A
+ movdqa MSG0_A, MSG0_B
+ pxor MSG1_B, MSG1_B
+ pxor MSG2_B, MSG2_B
+ movdqa MSG3_A, MSG3_B
+ mov $2, FINAL_STEP
+ jmp .Lfinup2x_loop_have_bswapped_data
+
+.Lfinup2x_done:
+ // Write the two digests with all bytes in the correct order.
+ movdqa STATE0_A, TMP_A
+ movdqa STATE0_B, TMP_B
+ punpcklqdq STATE1_A, STATE0_A // GHEF
+ punpcklqdq STATE1_B, STATE0_B
+ punpckhqdq TMP_A, STATE1_A // ABCD
+ punpckhqdq TMP_B, STATE1_B
+ pshufd $0xB1, STATE0_A, STATE0_A // HGFE
+ pshufd $0xB1, STATE0_B, STATE0_B
+ pshufd $0x1B, STATE1_A, STATE1_A // DCBA
+ pshufd $0x1B, STATE1_B, STATE1_B
+ pshufb SHUF_MASK, STATE0_A
+ pshufb SHUF_MASK, STATE0_B
+ pshufb SHUF_MASK, STATE1_A
+ pshufb SHUF_MASK, STATE1_B
+ movdqu STATE0_A, 1*16(OUT1)
+ movdqu STATE0_B, 1*16(OUT2)
+ movdqu STATE1_A, 0*16(OUT1)
+ movdqu STATE1_B, 0*16(OUT2)
+
+ mov %rbp, %rsp
+ pop %rbp
+ pop %rbx
+ RET
+SYM_FUNC_END(sha256_ni_finup2x)
+
+.section .rodata.cst256.K256, "aM", @progbits, 256
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203
diff --git a/lib/crypto/x86/sha256-ssse3-asm.S b/lib/crypto/x86/sha256-ssse3-asm.S
new file mode 100644
index 000000000000..383b8eec7ebe
--- /dev/null
+++ b/lib/crypto/x86/sha256-ssse3-asm.S
@@ -0,0 +1,505 @@
+########################################################################
+# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define MOVDQ movdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+ add \p1, \p2
+ mov \p2, \p1
+.endm
+
+################################
+
+# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+# Load xmm with mem and byte swap each dword
+.macro COPY_XMM_AND_BSWAP p1 p2 p3
+ MOVDQ \p2, \p1
+ pshufb \p3, \p1
+.endm
+
+################################
+
+X0 = %xmm4
+X1 = %xmm5
+X2 = %xmm6
+X3 = %xmm7
+
+XTMP0 = %xmm0
+XTMP1 = %xmm1
+XTMP2 = %xmm2
+XTMP3 = %xmm3
+XTMP4 = %xmm8
+XFER = %xmm9
+
+SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
+SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %xmm12
+
+NUM_BLKS = %rdx # 3rd arg
+INP = %rsi # 2nd arg
+CTX = %rdi # 1st arg
+
+SRND = %rsi # clobbers INP
+c = %ecx
+d = %r8d
+e = %edx
+TBL = %r12
+a = %eax
+b = %ebx
+
+f = %r9d
+g = %r10d
+h = %r11d
+
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_XFER_SIZE = 16
+_XMM_SAVE_SIZE = 0
+
+_INP_END = 0
+_INP = _INP_END + _INP_END_SIZE
+_XFER = _INP + _INP_SIZE
+_XMM_SAVE = _XFER + _XFER_SIZE
+STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+X_ = X0
+X0 = X1
+X1 = X2
+X2 = X3
+X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+ ## compute s0 four at a time and s1 two at a time
+ ## compute W[-16] + W[-7] 4 at a time
+ movdqa X3, XTMP0
+ mov e, y0 # y0 = e
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ palignr $4, X2, XTMP0 # XTMP0 = W[-7]
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ movdqa X1, XTMP1
+ xor a, y1 # y1 = a ^ (a >> (22-13))
+ xor g, y2 # y2 = f^g
+ paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16]
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ ## compute s0
+ palignr $4, X0, XTMP1 # XTMP1 = W[-15]
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ movdqa XTMP1, XTMP2 # XTMP2 = W[-15]
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y0, y2 # y2 = S1 + CH
+ add _XFER(%rsp), y2 # y2 = k + w + S1 + CH
+ movdqa XTMP1, XTMP3 # XTMP3 = W[-15]
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ pslld $(32-7), XTMP1 #
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ psrld $7, XTMP2 #
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7
+ or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ #
+ ROTATE_ARGS #
+ movdqa XTMP3, XTMP2 # XTMP2 = W[-15]
+ mov e, y0 # y0 = e
+ mov a, y1 # y1 = a
+ movdqa XTMP3, XTMP4 # XTMP4 = W[-15]
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ pslld $(32-18), XTMP3 #
+ xor a, y1 # y1 = a ^ (a >> (22-13))
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor g, y2 # y2 = f^g
+ psrld $18, XTMP2 #
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ pxor XTMP3, XTMP1
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ psrld $3, XTMP4 # XTMP4 = W[-15] >> 3
+ add y0, y2 # y2 = S1 + CH
+ add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ pxor XTMP4, XTMP1 # XTMP1 = s0
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ ## compute low s1
+ pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
+ or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA}
+ mov e, y0 # y0 = e
+ mov a, y1 # y1 = a
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA}
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ mov f, y2 # y2 = f
+ xor a, y1 # y1 = a ^ (a >> (22-13))
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
+ xor g, y2 # y2 = f^g
+ psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ pxor XTMP3, XTMP2
+ add y0, y2 # y2 = S1 + CH
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA}
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA}
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ ## compute high s1
+ pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
+ or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ #
+ ROTATE_ARGS #
+ movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC}
+ mov e, y0 # y0 = e
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ movdqa XTMP2, X0 # X0 = W[-2] {DDCC}
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
+ xor a, y1 # y1 = a ^ (a >> (22-13))
+ xor g, y2 # y2 = f^g
+ psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ psrld $10, X0 # X0 = W[-2] >> 10 {DDCC}
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ pxor XTMP3, XTMP2 #
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y0, y2 # y2 = S1 + CH
+ add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ pxor XTMP2, X0 # X0 = s1 {xDxC}
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ pshufb SHUF_DC00, X0 # X0 = s1 {DC00}
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ rotate_Xs
+.endm
+
+## input is [rsp + _XFER + \round * 4]
+.macro DO_ROUND round
+ mov e, y0 # y0 = e
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ mov f, y2 # y2 = f
+ xor a, y1 # y1 = a ^ (a >> (22-13))
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor g, y2 # y2 = f^g
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ and e, y2 # y2 = (f^g)&e
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ add y0, y2 # y2 = S1 + CH
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ offset = \round * 4 + _XFER
+ add offset(%rsp), y2 # y2 = k + w + S1 + CH
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+.endm
+
+########################################################################
+## void sha256_transform_ssse3(struct sha256_block_state *state,
+## const u8 *data, size_t nblocks);
+########################################################################
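+## Called via the sha256_blocks_ssse3() wrapper in lib/crypto/x86/sha256.h,
+## which brackets it with kernel_fpu_begin()/kernel_fpu_end().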
+.text
+SYM_FUNC_START(sha256_transform_ssse3)
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbp
+ mov %rsp, %rbp
+
+ subq $STACK_SIZE, %rsp
+ and $~15, %rsp
+
+ shl $6, NUM_BLKS # convert to bytes
+ add INP, NUM_BLKS
+ mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data
+
+ ## load initial digest
+ mov 4*0(CTX), a
+ mov 4*1(CTX), b
+ mov 4*2(CTX), c
+ mov 4*3(CTX), d
+ mov 4*4(CTX), e
+ mov 4*5(CTX), f
+ mov 4*6(CTX), g
+ mov 4*7(CTX), h
+
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+ movdqa _SHUF_00BA(%rip), SHUF_00BA
+ movdqa _SHUF_DC00(%rip), SHUF_DC00
+
+.Lloop0:
+ lea K256(%rip), TBL
+
+ ## byte swap first 16 dwords
+ COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
+
+ mov INP, _INP(%rsp)
+
+ ## schedule 48 input dwords, by doing 3 rounds of 16 each
+ mov $3, SRND
+.align 16
+.Lloop1:
+ movdqa (TBL), XFER
+ paddd X0, XFER
+ movdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa 1*16(TBL), XFER
+ paddd X0, XFER
+ movdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa 2*16(TBL), XFER
+ paddd X0, XFER
+ movdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa 3*16(TBL), XFER
+ paddd X0, XFER
+ movdqa XFER, _XFER(%rsp)
+ add $4*16, TBL
+ FOUR_ROUNDS_AND_SCHED
+
+ sub $1, SRND
+ jne .Lloop1
+
+ mov $2, SRND
+.Lloop2:
+ paddd (TBL), X0
+ movdqa X0, _XFER(%rsp)
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+ paddd 1*16(TBL), X1
+ movdqa X1, _XFER(%rsp)
+ add $2*16, TBL
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ movdqa X2, X0
+ movdqa X3, X1
+
+ sub $1, SRND
+ jne .Lloop2
+
+ addm (4*0)(CTX),a
+ addm (4*1)(CTX),b
+ addm (4*2)(CTX),c
+ addm (4*3)(CTX),d
+ addm (4*4)(CTX),e
+ addm (4*5)(CTX),f
+ addm (4*6)(CTX),g
+ addm (4*7)(CTX),h
+
+ mov _INP(%rsp), INP
+ add $64, INP
+ cmp _INP_END(%rsp), INP
+ jne .Lloop0
+
+ mov %rbp, %rsp
+ popq %rbp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+
+ RET
+SYM_FUNC_END(sha256_transform_ssse3)
+
+.section .rodata.cst256.K256, "aM", @progbits, 256
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203
+
+.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
+.align 16
+# shuffle xBxA -> 00BA
+_SHUF_00BA:
+ .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
+.align 16
+# shuffle xDxC -> DC00
+_SHUF_DC00:
+ .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/lib/crypto/x86/sha256.h b/lib/crypto/x86/sha256.h
new file mode 100644
index 000000000000..38e33b22a092
--- /dev/null
+++ b/lib/crypto/x86/sha256.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 optimized for x86_64
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <linux/static_call.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni);
+
+DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic);
+
+#define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \
+ asmlinkage void asm_fn(struct sha256_block_state *state, \
+ const u8 *data, size_t nblocks); \
+ static void c_fn(struct sha256_block_state *state, const u8 *data, \
+ size_t nblocks) \
+ { \
+ if (likely(irq_fpu_usable())) { \
+ kernel_fpu_begin(); \
+ asm_fn(state, data, nblocks); \
+ kernel_fpu_end(); \
+ } else { \
+ sha256_blocks_generic(state, data, nblocks); \
+ } \
+ }
+
+DEFINE_X86_SHA256_FN(sha256_blocks_ssse3, sha256_transform_ssse3);
+DEFINE_X86_SHA256_FN(sha256_blocks_avx, sha256_transform_avx);
+DEFINE_X86_SHA256_FN(sha256_blocks_avx2, sha256_transform_rorx);
+DEFINE_X86_SHA256_FN(sha256_blocks_ni, sha256_ni_transform);
+
+static void sha256_blocks(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ static_call(sha256_blocks_x86)(state, data, nblocks);
+}
+
+static_assert(offsetof(struct __sha256_ctx, state) == 0);
+static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
+static_assert(offsetof(struct __sha256_ctx, buf) == 40);
+asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, int len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE]);
+
+#define sha256_finup_2x_arch sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, size_t len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ /*
+ * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
+ * Further limit len to 65536 to avoid spending too long with preemption
+ * disabled. (Of course, in practice len is nearly always 4096 anyway.)
+ */
+ if (static_branch_likely(&have_sha_ni) && len >= SHA256_BLOCK_SIZE &&
+ len <= 65536 && likely(irq_fpu_usable())) {
+ kernel_fpu_begin();
+ sha256_ni_finup2x(ctx, data1, data2, len, out1, out2);
+ kernel_fpu_end();
+ kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
+ kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
+ return true;
+ }
+ return false;
+}
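+
+/*
+ * Hedged usage sketch (hash_one() is a hypothetical helper, not part of this
+ * file): a caller holding two same-length messages that share the prefix
+ * already absorbed into *ctx would try the interleaved path first:
+ *
+ *	if (!sha256_finup_2x_arch(ctx, data1, data2, len, out1, out2)) {
+ *		hash_one(ctx, data1, len, out1);
+ *		hash_one(ctx, data2, len, out2);
+ *	}
+ */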
+
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+ return static_key_enabled(&have_sha_ni);
+}
+
+#define sha256_mod_init_arch sha256_mod_init_arch
+static void sha256_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
+ static_call_update(sha256_blocks_x86, sha256_blocks_ni);
+ static_branch_enable(&have_sha_ni);
+ } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ NULL) &&
+ boot_cpu_has(X86_FEATURE_AVX)) {
+ if (boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_BMI2))
+ static_call_update(sha256_blocks_x86,
+ sha256_blocks_avx2);
+ else
+ static_call_update(sha256_blocks_x86,
+ sha256_blocks_avx);
+ } else if (boot_cpu_has(X86_FEATURE_SSSE3)) {
+ static_call_update(sha256_blocks_x86, sha256_blocks_ssse3);
+ }
+}
diff --git a/lib/crypto/x86/sha512-avx-asm.S b/lib/crypto/x86/sha512-avx-asm.S
new file mode 100644
index 000000000000..7732aa8fd850
--- /dev/null
+++ b/lib/crypto/x86/sha512-avx-asm.S
@@ -0,0 +1,420 @@
+########################################################################
+# Implement fast SHA-512 with AVX instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# David Cote <david.m.cote@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+# ARG1
+digest = %rdi
+# ARG2
+msg = %rsi
+# ARG3
+msglen = %rdx
+T1 = %rcx
+T2 = %r8
+a_64 = %r9
+b_64 = %r10
+c_64 = %r11
+d_64 = %r12
+e_64 = %r13
+f_64 = %r14
+g_64 = %r15
+h_64 = %rbx
+tmp0 = %rax
+
+# Local variables (stack frame)
+
+# Message Schedule
+W_SIZE = 80*8
+# W[t] + K[t] | W[t+1] + K[t+1]
+WK_SIZE = 2*8
+
+frame_W = 0
+frame_WK = frame_W + W_SIZE
+frame_size = frame_WK + WK_SIZE
+
+# Useful QWORD "arrays" for simpler memory references
+# MSG, DIGEST, K_t, W_t are arrays
+# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
+
+# Input message (arg1)
+#define MSG(i) 8*i(msg)
+
+# Output Digest (arg2)
+#define DIGEST(i) 8*i(digest)
+
+# SHA Constants (static mem)
+#define K_t(i) 8*i+K512(%rip)
+
+# Message Schedule (stack frame)
+#define W_t(i) 8*i+frame_W(%rsp)
+
+# W[t]+K[t] (stack frame)
+#define WK_2(i) 8*((i%2))+frame_WK(%rsp)
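+# E.g. (illustrative): W_t(9) expands to 8*9+frame_W(%rsp), the stack slot
+# holding W[9]; WK_2(9) selects the odd qword of the current W[t]+K[t] pair.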
+
+.macro RotateState
+ # Rotate symbols a..h right
+ TMP = h_64
+ h_64 = g_64
+ g_64 = f_64
+ f_64 = e_64
+ e_64 = d_64
+ d_64 = c_64
+ c_64 = b_64
+ b_64 = a_64
+ a_64 = TMP
+.endm
+
+.macro RORQ p1 p2
+ # shld is faster than ror on Sandybridge
+ shld $(64-\p2), \p1, \p1
+.endm
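+# E.g. "RORQ tmp0, 23" computes tmp0 = tmp0 ror 23 as "shld $41, tmp0, tmp0".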
+
+.macro SHA512_Round rnd
+ # Compute round \rnd
+ mov f_64, T1 # T1 = f
+ mov e_64, tmp0 # tmp = e
+ xor g_64, T1 # T1 = f ^ g
+ RORQ tmp0, 23 # 41 # tmp = e ror 23
+ and e_64, T1 # T1 = (f ^ g) & e
+ xor e_64, tmp0 # tmp = (e ror 23) ^ e
+ xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+ idx = \rnd
+ add WK_2(idx), T1 # W[t] + K[t] from message scheduler
+ RORQ tmp0, 4 # 18 # tmp = ((e ror 23) ^ e) ror 4
+ xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
+ mov a_64, T2 # T2 = a
+ add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
+ RORQ tmp0, 14 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+ add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+ mov a_64, tmp0 # tmp = a
+ xor c_64, T2 # T2 = a ^ c
+ and c_64, tmp0 # tmp = a & c
+ and b_64, T2 # T2 = (a ^ c) & b
+ xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+ mov a_64, tmp0 # tmp = a
+ RORQ tmp0, 5 # 39 # tmp = a ror 5
+ xor a_64, tmp0 # tmp = (a ror 5) ^ a
+ add T1, d_64 # e(next_state) = d + T1
+ RORQ tmp0, 6 # 34 # tmp = ((a ror 5) ^ a) ror 6
+ xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
+ lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
+ RORQ tmp0, 28 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+ add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) + S0(a)
+ RotateState
+.endm
+
+.macro SHA512_2Sched_2Round_avx rnd
+ # Compute rounds t-2 and t-1
+ # Compute message schedule QWORDS t and t+1
+
+ # Two rounds are computed based on the values for K[t-2]+W[t-2] and
+ # K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+ # scheduler.
+ # The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
+ # They are then added to their respective SHA512 constants at
+ # [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)]
+ # For brevity, the comments following vectored instructions only refer to
+ # the first of a pair of QWORDS.
+ # E.g. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
+ # The computation of the message schedule and the rounds are tightly
+ # stitched to take advantage of instruction-level parallelism.
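+ #
+ # For reference, the message schedule being computed is (SHA-512 sigmas):
+ #
+ #	s0(x) = (x ror 1) ^ (x ror 8) ^ (x >> 7)
+ #	s1(x) = (x ror 19) ^ (x ror 61) ^ (x >> 6)
+ #	W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]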
+
+ idx = \rnd - 2
+ vmovdqa W_t(idx), %xmm4 # XMM4 = W[t-2]
+ idx = \rnd - 15
+ vmovdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
+ mov f_64, T1
+ vpsrlq $61, %xmm4, %xmm0 # XMM0 = W[t-2]>>61
+ mov e_64, tmp0
+ vpsrlq $1, %xmm5, %xmm6 # XMM6 = W[t-15]>>1
+ xor g_64, T1
+ RORQ tmp0, 23 # 41
+ vpsrlq $19, %xmm4, %xmm1 # XMM1 = W[t-2]>>19
+ and e_64, T1
+ xor e_64, tmp0
+ vpxor %xmm1, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19
+ xor g_64, T1
+ idx = \rnd
+ add WK_2(idx), T1
+ vpsrlq $8, %xmm5, %xmm7 # XMM7 = W[t-15]>>8
+ RORQ tmp0, 4 # 18
+ vpsrlq $6, %xmm4, %xmm2 # XMM2 = W[t-2]>>6
+ xor e_64, tmp0
+ mov a_64, T2
+ add h_64, T1
+ vpxor %xmm7, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8
+ RORQ tmp0, 14 # 14
+ add tmp0, T1
+ vpsrlq $7, %xmm5, %xmm8 # XMM8 = W[t-15]>>7
+ mov a_64, tmp0
+ xor c_64, T2
+ vpsllq $(64-61), %xmm4, %xmm3 # XMM3 = W[t-2]<<3
+ and c_64, tmp0
+ and b_64, T2
+ vpxor %xmm3, %xmm2, %xmm2 # XMM2 = W[t-2]>>6 ^ W[t-2]<<3
+ xor tmp0, T2
+ mov a_64, tmp0
+ vpsllq $(64-1), %xmm5, %xmm9 # XMM9 = W[t-15]<<63
+ RORQ tmp0, 5 # 39
+ vpxor %xmm9, %xmm8, %xmm8 # XMM8 = W[t-15]>>7 ^ W[t-15]<<63
+ xor a_64, tmp0
+ add T1, d_64
+ RORQ tmp0, 6 # 34
+ xor a_64, tmp0
+ vpxor %xmm8, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^
+ # W[t-15]>>7 ^ W[t-15]<<63
+ lea (T1, T2), h_64
+ RORQ tmp0, 28 # 28
+ vpsllq $(64-19), %xmm4, %xmm4 # XMM4 = W[t-2]<<25
+ add tmp0, h_64
+ RotateState
+ vpxor %xmm4, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^
+ # W[t-2]<<25
+ mov f_64, T1
+ vpxor %xmm2, %xmm0, %xmm0 # XMM0 = s1(W[t-2])
+ mov e_64, tmp0
+ xor g_64, T1
+ idx = \rnd - 16
+ vpaddq W_t(idx), %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16]
+ idx = \rnd - 7
+ vmovdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
+ RORQ tmp0, 23 # 41
+ and e_64, T1
+ xor e_64, tmp0
+ xor g_64, T1
+ vpsllq $(64-8), %xmm5, %xmm5 # XMM5 = W[t-15]<<56
+ idx = \rnd + 1
+ add WK_2(idx), T1
+ vpxor %xmm5, %xmm6, %xmm6 # XMM6 = s0(W[t-15])
+ RORQ tmp0, 4 # 18
+ vpaddq %xmm6, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
+ xor e_64, tmp0
+ vpaddq %xmm1, %xmm0, %xmm0 # XMM0 = W[t] = s1(W[t-2]) + W[t-7] +
+ # s0(W[t-15]) + W[t-16]
+ mov a_64, T2
+ add h_64, T1
+ RORQ tmp0, 14 # 14
+ add tmp0, T1
+ idx = \rnd
+ vmovdqa %xmm0, W_t(idx) # Store W[t]
+ vpaddq K_t(idx), %xmm0, %xmm0 # Compute W[t]+K[t]
+ vmovdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds
+ mov a_64, tmp0
+ xor c_64, T2
+ and c_64, tmp0
+ and b_64, T2
+ xor tmp0, T2
+ mov a_64, tmp0
+ RORQ tmp0, 5 # 39
+ xor a_64, tmp0
+ add T1, d_64
+ RORQ tmp0, 6 # 34
+ xor a_64, tmp0
+ lea (T1, T2), h_64
+ RORQ tmp0, 28 # 28
+ add tmp0, h_64
+ RotateState
+.endm
+
+########################################################################
+# void sha512_transform_avx(struct sha512_block_state *state,
+# const u8 *data, size_t nblocks);
+# Purpose: Updates the SHA512 digest stored at "state" with the message
+# stored in "data".
+# The size of the message pointed to by "data" must be an integer multiple
+# of the SHA512 block size (128 bytes).
+# "nblocks" is the message length in SHA512 blocks. Must be >= 1.
+########################################################################
+SYM_FUNC_START(sha512_transform_avx)
+
+ # Save GPRs
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ # Allocate Stack Space
+ push %rbp
+ mov %rsp, %rbp
+ sub $frame_size, %rsp
+ and $~(0x20 - 1), %rsp
+
+.Lupdateblock:
+
+ # Load state variables
+ mov DIGEST(0), a_64
+ mov DIGEST(1), b_64
+ mov DIGEST(2), c_64
+ mov DIGEST(3), d_64
+ mov DIGEST(4), e_64
+ mov DIGEST(5), f_64
+ mov DIGEST(6), g_64
+ mov DIGEST(7), h_64
+
+ t = 0
+ .rept 80/2 + 1
+ # (80 rounds) / (2 rounds/iteration) + (1 iteration)
+ # +1 iteration because the scheduler leads hashing by 1 iteration
+ .if t < 2
+ # BSWAP 2 QWORDS
+ vmovdqa XMM_QWORD_BSWAP(%rip), %xmm1
+ vmovdqu MSG(t), %xmm0
+ vpshufb %xmm1, %xmm0, %xmm0 # BSWAP
+ vmovdqa %xmm0, W_t(t) # Store Scheduled Pair
+ vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
+ vmovdqa %xmm0, WK_2(t) # Store into WK for rounds
+ .elseif t < 16
+ # BSWAP 2 QWORDS # Compute 2 Rounds
+ vmovdqu MSG(t), %xmm0
+ vpshufb %xmm1, %xmm0, %xmm0 # BSWAP
+ SHA512_Round t-2 # Round t-2
+ vmovdqa %xmm0, W_t(t) # Store Scheduled Pair
+ vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
+ SHA512_Round t-1 # Round t-1
+ vmovdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK
+ .elseif t < 79
+ # Schedule 2 QWORDS # Compute 2 Rounds
+ SHA512_2Sched_2Round_avx t
+ .else
+ # Compute 2 Rounds
+ SHA512_Round t-2
+ SHA512_Round t-1
+ .endif
+ t = t+2
+ .endr
+
+ # Update digest
+ add a_64, DIGEST(0)
+ add b_64, DIGEST(1)
+ add c_64, DIGEST(2)
+ add d_64, DIGEST(3)
+ add e_64, DIGEST(4)
+ add f_64, DIGEST(5)
+ add g_64, DIGEST(6)
+ add h_64, DIGEST(7)
+
+ # Advance to next message block
+ add $16*8, msg
+ dec msglen
+ jnz .Lupdateblock
+
+ # Restore Stack Pointer
+ mov %rbp, %rsp
+ pop %rbp
+
+ # Restore GPRs
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+
+ RET
+SYM_FUNC_END(sha512_transform_avx)
+
+########################################################################
+### Binary Data
+
+.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
+.align 16
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+ .octa 0x08090a0b0c0d0e0f0001020304050607
+
+# Mergeable 640-byte rodata section. This allows the linker to merge the
+# table with an identical 640-byte fragment of another rodata section (if
+# such a section exists).
+.section .rodata.cst640.K512, "aM", @progbits, 640
+.align 64
+# K[t] used in SHA512 hashing
+K512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
diff --git a/lib/crypto/x86/sha512-avx2-asm.S b/lib/crypto/x86/sha512-avx2-asm.S
new file mode 100644
index 000000000000..22bdbfd899d0
--- /dev/null
+++ b/lib/crypto/x86/sha512-avx2-asm.S
@@ -0,0 +1,748 @@
+########################################################################
+# Implement fast SHA-512 with AVX2 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# David Cote <david.m.cote@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 1 block at a time, with 4 lanes per block
+########################################################################
+
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+Y_0 = %ymm4
+Y_1 = %ymm5
+Y_2 = %ymm6
+Y_3 = %ymm7
+
+YTMP0 = %ymm0
+YTMP1 = %ymm1
+YTMP2 = %ymm2
+YTMP3 = %ymm3
+YTMP4 = %ymm8
+XFER = YTMP0
+
+BYTE_FLIP_MASK = %ymm9
+
+# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
+CTX1 = %rdi
+CTX2 = %r12
+# 2nd arg
+INP = %rsi
+# 3rd arg
+NUM_BLKS = %rdx
+
+c = %rcx
+d = %r8
+e = %rdx
+y3 = %rsi
+
+TBL = %rdi # clobbers CTX1
+
+a = %rax
+b = %rbx
+
+f = %r9
+g = %r10
+h = %r11
+old_h = %r11
+
+T1 = %r12 # clobbers CTX2
+y0 = %r13
+y1 = %r14
+y2 = %r15
+
+# Local variables (stack frame)
+XFER_SIZE = 4*8
+SRND_SIZE = 1*8
+INP_SIZE = 1*8
+INPEND_SIZE = 1*8
+CTX_SIZE = 1*8
+
+frame_XFER = 0
+frame_SRND = frame_XFER + XFER_SIZE
+frame_INP = frame_SRND + SRND_SIZE
+frame_INPEND = frame_INP + INP_SIZE
+frame_CTX = frame_INPEND + INPEND_SIZE
+frame_size = frame_CTX + CTX_SIZE
+
+## assume buffers not aligned
+#define VMOVDQ vmovdqu
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+ add \p1, \p2
+ mov \p2, \p1
+.endm
+
+
+# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
+# Load ymm with mem and byte swap each qword
+.macro COPY_YMM_AND_BSWAP p1 p2 p3
+ VMOVDQ \p2, \p1
+ vpshufb \p3, \p1, \p1
+.endm
+# rotate_Ys
+# Rotate values of symbols Y0...Y3
+.macro rotate_Ys
+ Y_ = Y_0
+ Y_0 = Y_1
+ Y_1 = Y_2
+ Y_2 = Y_3
+ Y_3 = Y_
+.endm
+
+# RotateState
+.macro RotateState
+ # Rotate symbols a..h right
+ old_h = h
+ TMP_ = h
+ h = g
+ g = f
+ f = e
+ e = d
+ d = c
+ c = b
+ b = a
+ a = TMP_
+.endm
+
+# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL
+# YDST = {YSRC1, YSRC2} >> RVAL*8
+.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
+ vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI}
+ vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+################################### RND N + 0 #########################################
+
+ # Extract w[t-7]
+ MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7]
+ # Calculate w[t-16] + w[t-7]
+ vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16]
+ # Extract w[t-15]
+ MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15]
+
+ # Calculate sigma0
+
+ # Calculate w[t-15] ror 1
+ vpsrlq $1, YTMP1, YTMP2
+ vpsllq $(64-1), YTMP1, YTMP3
+ vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1
+ # Calculate w[t-15] shr 7
+ vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ add frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+ mov f, y2 # y2 = f # CH
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ xor g, y2 # y2 = f^g # CH
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+
+ and e, y2 # y2 = (f^g)&e # CH
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ add h, d # d = k + w + h + d # --
+
+ and b, y3 # y3 = (a|c)&b # MAJA
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+
+ add y0, y2 # y2 = S1 + CH # --
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+################################### RND N + 1 #########################################
+
+ # Calculate w[t-15] ror 8
+ vpsrlq $8, YTMP1, YTMP2
+ vpsllq $(64-8), YTMP1, YTMP1
+ vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8
+ # XOR the three components
+ vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
+ vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0
+
+
+ # Add three components, w[t-16], w[t-7] and sigma0
+ vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
+ # Move to appropriate lanes for calculating w[16] and w[17]
+ vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
+ # Move to appropriate lanes for calculating w[18] and w[19]
+ vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}
+
+ # Calculate w[16] and w[17] in both 128 bit lanes
+
+ # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
+ vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
+ vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}
+
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+
+ mov f, y2 # y2 = f # CH
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ xor g, y2 # y2 = f^g # CH
+
+
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ and e, y2 # y2 = (f^g)&e # CH
+ add h, d # d = k + w + h + d # --
+
+ and b, y3 # y3 = (a|c)&b # MAJA
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+
+################################### RND N + 2 #########################################
+
+ vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA}
+ vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA}
+ vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA}
+ vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
+ vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA}
+ vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA}
+ vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA}
+ vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
+ # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
+
+ # Add sigma1 to the other components to get w[16] and w[17]
+ vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]}
+
+ # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
+ vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--}
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ add 2*8+frame_XFER(%rsp), h # h = k + w + h # --
+
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ or c, y3 # y3 = a|c # MAJA
+ mov f, y2 # y2 = f # CH
+ xor g, y2 # y2 = f^g # CH
+
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ add h, d # d = k + w + h + d # --
+ and b, y3 # y3 = (a|c)&b # MAJA
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+################################### RND N + 3 #########################################
+
+ vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
+ vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--}
+ vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
+ vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
+ vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
+ vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--}
+ vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
+ vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
+ # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
+
+ # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
+ # to newly calculated sigma1 to get w[18] and w[19]
+ vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}
+
+ # Form w[19], w[18], w[17], w[16]
+ vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]}
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+
+ mov f, y2 # y2 = f # CH
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ xor g, y2 # y2 = f^g # CH
+
+
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add h, d # d = k + w + h + d # --
+ and b, y3 # y3 = (a|c)&b # MAJA
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ add y0, y2 # y2 = S1 + CH # --
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+
+ add y1, h # h = k + w + h + S0 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+ rotate_Ys
+.endm
+
+.macro DO_4ROUNDS
+
+################################### RND N + 0 #########################################
+
+ mov f, y2 # y2 = f # CH
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ add frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ RotateState
+
+################################### RND N + 1 #########################################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ add 8*1+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ RotateState
+
+################################### RND N + 2 #########################################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ add 8*2+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ RotateState
+
+################################### RND N + 3 #########################################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ add 8*3+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+.endm
+
+########################################################################
+# void sha512_transform_rorx(struct sha512_block_state *state,
+# const u8 *data, size_t nblocks);
+# Purpose: Updates the SHA512 digest stored at "state" with the message
+# stored in "data".
+# The size of the message pointed to by "data" must be an integer multiple
+# of the SHA512 block size (128 bytes).
+# "nblocks" is the message length in SHA512 blocks. Must be >= 1.
+########################################################################
+SYM_FUNC_START(sha512_transform_rorx)
+
+ # Save GPRs
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ # Allocate Stack Space
+ push %rbp
+ mov %rsp, %rbp
+ sub $frame_size, %rsp
+ and $~(0x20 - 1), %rsp
+
+ shl $7, NUM_BLKS # convert to bytes
+ add INP, NUM_BLKS # pointer to end of data
+ mov NUM_BLKS, frame_INPEND(%rsp)
+
+ ## load initial digest
+ mov 8*0(CTX1), a
+ mov 8*1(CTX1), b
+ mov 8*2(CTX1), c
+ mov 8*3(CTX1), d
+ mov 8*4(CTX1), e
+ mov 8*5(CTX1), f
+ mov 8*6(CTX1), g
+ mov 8*7(CTX1), h
+
+ # save %rdi (CTX) before it gets clobbered
+ mov %rdi, frame_CTX(%rsp)
+
+ vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+
+.Lloop0:
+ lea K512(%rip), TBL
+
+ ## byte swap first 16 qwords
+ COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK
+ COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK
+ COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK
+ COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK
+
+ mov INP, frame_INP(%rsp)
+
+ ## schedule 64 input qwords, by doing 4 rounds of 16 each
+ movq $4, frame_SRND(%rsp)
+
+.align 16
+.Lloop1:
+ vpaddq (TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddq 1*32(TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddq 2*32(TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddq 3*32(TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ add $(4*32), TBL
+ FOUR_ROUNDS_AND_SCHED
+
+ subq $1, frame_SRND(%rsp)
+ jne .Lloop1
+
+ movq $2, frame_SRND(%rsp)
+.Lloop2:
+ vpaddq (TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ DO_4ROUNDS
+ vpaddq 1*32(TBL), Y_1, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ add $(2*32), TBL
+ DO_4ROUNDS
+
+ vmovdqa Y_2, Y_0
+ vmovdqa Y_3, Y_1
+
+ subq $1, frame_SRND(%rsp)
+ jne .Lloop2
+
+ mov frame_CTX(%rsp), CTX2
+ addm 8*0(CTX2), a
+ addm 8*1(CTX2), b
+ addm 8*2(CTX2), c
+ addm 8*3(CTX2), d
+ addm 8*4(CTX2), e
+ addm 8*5(CTX2), f
+ addm 8*6(CTX2), g
+ addm 8*7(CTX2), h
+
+ mov frame_INP(%rsp), INP
+ add $128, INP
+ cmp frame_INPEND(%rsp), INP
+ jne .Lloop0
+
+ # Restore Stack Pointer
+ mov %rbp, %rsp
+ pop %rbp
+
+ # Restore GPRs
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+
+ vzeroupper
+ RET
+SYM_FUNC_END(sha512_transform_rorx)
+
+########################################################################
+### Binary Data
+
+
+# Mergeable 640-byte rodata section. This allows the linker to merge the
+# table with an identical 640-byte fragment of another rodata section (if
+# such a section exists).
+.section .rodata.cst640.K512, "aM", @progbits, 640
+.align 64
+# K[t] used in SHA512 hashing
+K512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
+# Mask for byte-swapping a couple of qwords in a YMM register using vpshufb.
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x08090a0b0c0d0e0f0001020304050607
+ .octa 0x18191a1b1c1d1e1f1011121314151617
+
+.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
+.align 32
+MASK_YMM_LO:
+ .octa 0x00000000000000000000000000000000
+ .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
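
For reference, the CH/MAJ/S0/S1 annotations in the round macros above follow
the standard FIPS 180-4 SHA-512 round. A minimal scalar C sketch of one round
(illustration only, not part of this patch; the asm computes MAJ as
((a|c)&b)|(a&c), which is equivalent to the textbook (a&b)^(a&c)^(b&c)):

	#include <stdint.h>

	static inline uint64_t ror64(uint64_t x, unsigned int n)
	{
		return (x >> n) | (x << (64 - n));
	}

	/* One SHA-512 round: s[0..7] = a..h, wk = W[t] + K[t]. */
	static void sha512_round(uint64_t s[8], uint64_t wk)
	{
		uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
		uint64_t e = s[4], f = s[5], g = s[6], h = s[7];
		uint64_t S1  = ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41);
		uint64_t ch  = (e & f) ^ (~e & g);
		uint64_t t1  = h + S1 + ch + wk;
		uint64_t S0  = ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39);
		uint64_t maj = ((a | c) & b) | (a & c);
		uint64_t t2  = S0 + maj;

		/* RotateState: shift working variables and fold in t1/t2. */
		s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
		s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
	}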
diff --git a/lib/crypto/x86/sha512-ssse3-asm.S b/lib/crypto/x86/sha512-ssse3-asm.S
new file mode 100644
index 000000000000..4cae7445b2a8
--- /dev/null
+++ b/lib/crypto/x86/sha512-ssse3-asm.S
@@ -0,0 +1,419 @@
+########################################################################
+# Implement fast SHA-512 with SSSE3 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# David Cote <david.m.cote@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+# ARG1
+digest = %rdi
+# ARG2
+msg = %rsi
+# ARG3
+msglen = %rdx
+T1 = %rcx
+T2 = %r8
+a_64 = %r9
+b_64 = %r10
+c_64 = %r11
+d_64 = %r12
+e_64 = %r13
+f_64 = %r14
+g_64 = %r15
+h_64 = %rbx
+tmp0 = %rax
+
+# Local variables (stack frame)
+
+W_SIZE = 80*8
+WK_SIZE = 2*8
+
+frame_W = 0
+frame_WK = frame_W + W_SIZE
+frame_size = frame_WK + WK_SIZE
+
+# Useful QWORD "arrays" for simpler memory references
+# MSG, DIGEST, K_t, W_t are arrays
+# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
+
+# Input message (arg1)
+#define MSG(i) 8*i(msg)
+
+# Output Digest (arg2)
+#define DIGEST(i) 8*i(digest)
+
+# SHA Constants (static mem)
+#define K_t(i) 8*i+K512(%rip)
+
+# Message Schedule (stack frame)
+#define W_t(i) 8*i+frame_W(%rsp)
+
+# W[t]+K[t] (stack frame)
+#define WK_2(i) 8*((i%2))+frame_WK(%rsp)
+
+.macro RotateState
+ # Rotate symbols a..h right
+ TMP = h_64
+ h_64 = g_64
+ g_64 = f_64
+ f_64 = e_64
+ e_64 = d_64
+ d_64 = c_64
+ c_64 = b_64
+ b_64 = a_64
+ a_64 = TMP
+.endm
+
+.macro SHA512_Round rnd
+
+	# Compute round \rnd
+ mov f_64, T1 # T1 = f
+ mov e_64, tmp0 # tmp = e
+ xor g_64, T1 # T1 = f ^ g
+ ror $23, tmp0 # 41 # tmp = e ror 23
+ and e_64, T1 # T1 = (f ^ g) & e
+ xor e_64, tmp0 # tmp = (e ror 23) ^ e
+ xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+ idx = \rnd
+ add WK_2(idx), T1 # W[t] + K[t] from message scheduler
+ ror $4, tmp0 # 18 # tmp = ((e ror 23) ^ e) ror 4
+ xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
+ mov a_64, T2 # T2 = a
+ add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
+ ror $14, tmp0 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+ add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+ mov a_64, tmp0 # tmp = a
+ xor c_64, T2 # T2 = a ^ c
+ and c_64, tmp0 # tmp = a & c
+ and b_64, T2 # T2 = (a ^ c) & b
+ xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+ mov a_64, tmp0 # tmp = a
+ ror $5, tmp0 # 39 # tmp = a ror 5
+ xor a_64, tmp0 # tmp = (a ror 5) ^ a
+ add T1, d_64 # e(next_state) = d + T1
+ ror $6, tmp0 # 34 # tmp = ((a ror 5) ^ a) ror 6
+ xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
+ lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
+ ror $28, tmp0 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+	add	tmp0, h_64	# a(next_state) = T1 + Maj(a,b,c) + S0(a)
+ RotateState
+.endm
+
+.macro SHA512_2Sched_2Round_sse rnd
+
+ # Compute rounds t-2 and t-1
+ # Compute message schedule QWORDS t and t+1
+
+ # Two rounds are computed based on the values for K[t-2]+W[t-2] and
+ # K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+ # scheduler.
+	# The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
+	# They are then added to their respective SHA512 constants at
+	# [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)].
+	# For brevity, the comments following vectored instructions only refer
+	# to the first of a pair of QWORDS.
+	# E.g. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}.
+ # The computation of the message schedule and the rounds are tightly
+ # stitched to take advantage of instruction-level parallelism.
+ # For clarity, integer instructions (for the rounds calculation) are indented
+ # by one tab. Vectored instructions (for the message scheduler) are indented
+ # by two tabs.
+
+ mov f_64, T1
+	idx = \rnd - 2
+ movdqa W_t(idx), %xmm2 # XMM2 = W[t-2]
+ xor g_64, T1
+ and e_64, T1
+ movdqa %xmm2, %xmm0 # XMM0 = W[t-2]
+ xor g_64, T1
+ idx = \rnd
+ add WK_2(idx), T1
+ idx = \rnd - 15
+ movdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
+ mov e_64, tmp0
+ ror $23, tmp0 # 41
+ movdqa %xmm5, %xmm3 # XMM3 = W[t-15]
+ xor e_64, tmp0
+ ror $4, tmp0 # 18
+ psrlq $61-19, %xmm0 # XMM0 = W[t-2] >> 42
+ xor e_64, tmp0
+ ror $14, tmp0 # 14
+ psrlq $(8-7), %xmm3 # XMM3 = W[t-15] >> 1
+ add tmp0, T1
+ add h_64, T1
+ pxor %xmm2, %xmm0 # XMM0 = (W[t-2] >> 42) ^ W[t-2]
+ mov a_64, T2
+ xor c_64, T2
+ pxor %xmm5, %xmm3 # XMM3 = (W[t-15] >> 1) ^ W[t-15]
+ and b_64, T2
+ mov a_64, tmp0
+ psrlq $(19-6), %xmm0 # XMM0 = ((W[t-2]>>42)^W[t-2])>>13
+ and c_64, tmp0
+ xor tmp0, T2
+ psrlq $(7-1), %xmm3 # XMM3 = ((W[t-15]>>1)^W[t-15])>>6
+ mov a_64, tmp0
+ ror $5, tmp0 # 39
+ pxor %xmm2, %xmm0 # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
+ xor a_64, tmp0
+ ror $6, tmp0 # 34
+ pxor %xmm5, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
+ xor a_64, tmp0
+ ror $28, tmp0 # 28
+ psrlq $6, %xmm0 # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
+ add tmp0, T2
+ add T1, d_64
+ psrlq $1, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
+ lea (T1, T2), h_64
+ RotateState
+ movdqa %xmm2, %xmm1 # XMM1 = W[t-2]
+ mov f_64, T1
+ xor g_64, T1
+ movdqa %xmm5, %xmm4 # XMM4 = W[t-15]
+ and e_64, T1
+ xor g_64, T1
+	psllq	$(64-19)-(64-61), %xmm1	# XMM1 = W[t-2] << 42
+ idx = \rnd + 1
+ add WK_2(idx), T1
+ mov e_64, tmp0
+ psllq $(64-1)-(64-8), %xmm4 # XMM4 = W[t-15] << 7
+ ror $23, tmp0 # 41
+ xor e_64, tmp0
+ pxor %xmm2, %xmm1 # XMM1 = (W[t-2] << 42)^W[t-2]
+ ror $4, tmp0 # 18
+ xor e_64, tmp0
+ pxor %xmm5, %xmm4 # XMM4 = (W[t-15]<<7)^W[t-15]
+ ror $14, tmp0 # 14
+ add tmp0, T1
+ psllq $(64-61), %xmm1 # XMM1 = ((W[t-2] << 42)^W[t-2])<<3
+ add h_64, T1
+ mov a_64, T2
+ psllq $(64-8), %xmm4 # XMM4 = ((W[t-15]<<7)^W[t-15])<<56
+ xor c_64, T2
+ and b_64, T2
+ pxor %xmm1, %xmm0 # XMM0 = s1(W[t-2])
+ mov a_64, tmp0
+ and c_64, tmp0
+ idx = \rnd - 7
+ movdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
+ xor tmp0, T2
+ pxor %xmm4, %xmm3 # XMM3 = s0(W[t-15])
+ mov a_64, tmp0
+ paddq %xmm3, %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15])
+ ror $5, tmp0 # 39
+	idx = \rnd - 16
+ paddq W_t(idx), %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
+ xor a_64, tmp0
+ paddq %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
+ ror $6, tmp0 # 34
+ movdqa %xmm0, W_t(\rnd) # Store scheduled qwords
+ xor a_64, tmp0
+ paddq K_t(\rnd), %xmm0 # Compute W[t]+K[t]
+ ror $28, tmp0 # 28
+ idx = \rnd
+ movdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds
+ add tmp0, T2
+ add T1, d_64
+ lea (T1, T2), h_64
+ RotateState
+.endm
+
+########################################################################
+# void sha512_transform_ssse3(struct sha512_block_state *state,
+# const u8 *data, size_t nblocks);
+# Purpose: Updates the SHA512 digest stored at "state" with the message
+# stored in "data".
+# The size of the message pointed to by "data" must be an integer
+# multiple of the SHA512 block size (128 bytes).
+# "nblocks" is the message length in SHA512 blocks. Must be >= 1.
+########################################################################
+SYM_FUNC_START(sha512_transform_ssse3)
+
+ # Save GPRs
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ # Allocate Stack Space
+ push %rbp
+ mov %rsp, %rbp
+ sub $frame_size, %rsp
+ and $~(0x20 - 1), %rsp
+
+.Lupdateblock:
+
+# Load state variables
+ mov DIGEST(0), a_64
+ mov DIGEST(1), b_64
+ mov DIGEST(2), c_64
+ mov DIGEST(3), d_64
+ mov DIGEST(4), e_64
+ mov DIGEST(5), f_64
+ mov DIGEST(6), g_64
+ mov DIGEST(7), h_64
+
+ t = 0
+ .rept 80/2 + 1
+ # (80 rounds) / (2 rounds/iteration) + (1 iteration)
+ # +1 iteration because the scheduler leads hashing by 1 iteration
+ .if t < 2
+ # BSWAP 2 QWORDS
+ movdqa XMM_QWORD_BSWAP(%rip), %xmm1
+ movdqu MSG(t), %xmm0
+ pshufb %xmm1, %xmm0 # BSWAP
+ movdqa %xmm0, W_t(t) # Store Scheduled Pair
+ paddq K_t(t), %xmm0 # Compute W[t]+K[t]
+ movdqa %xmm0, WK_2(t) # Store into WK for rounds
+ .elseif t < 16
+	# BSWAP 2 QWORDS; Compute 2 Rounds
+ movdqu MSG(t), %xmm0
+ pshufb %xmm1, %xmm0 # BSWAP
+ SHA512_Round t-2 # Round t-2
+ movdqa %xmm0, W_t(t) # Store Scheduled Pair
+ paddq K_t(t), %xmm0 # Compute W[t]+K[t]
+ SHA512_Round t-1 # Round t-1
+ movdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK
+ .elseif t < 79
+	# Schedule 2 QWORDS; Compute 2 Rounds
+ SHA512_2Sched_2Round_sse t
+ .else
+ # Compute 2 Rounds
+ SHA512_Round t-2
+ SHA512_Round t-1
+ .endif
+ t = t+2
+ .endr
+
+ # Update digest
+ add a_64, DIGEST(0)
+ add b_64, DIGEST(1)
+ add c_64, DIGEST(2)
+ add d_64, DIGEST(3)
+ add e_64, DIGEST(4)
+ add f_64, DIGEST(5)
+ add g_64, DIGEST(6)
+ add h_64, DIGEST(7)
+
+ # Advance to next message block
+ add $16*8, msg
+ dec msglen
+ jnz .Lupdateblock
+
+ # Restore Stack Pointer
+ mov %rbp, %rsp
+ pop %rbp
+
+ # Restore GPRs
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+
+ RET
+SYM_FUNC_END(sha512_transform_ssse3)
+
+########################################################################
+### Binary Data
+
+.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
+.align 16
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+ .octa 0x08090a0b0c0d0e0f0001020304050607
+
+# Mergeable 640-byte rodata section. This allows the linker to merge the
+# table with an identical 640-byte fragment of another rodata section
+# (if one exists).
+.section .rodata.cst640.K512, "aM", @progbits, 640
+.align 64
+# K[t] used in SHA512 hashing
+K512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
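
The interleaved psrlq/psllq/pxor sequences above evaluate the SHA-512 small
sigma functions on two schedule qwords at a time, splitting each 64-bit rotate
into a pair of shifts. The equivalent scalar schedule update, as a C sketch
(standard FIPS 180-4 definitions, illustration only):

	#include <stdint.h>

	static inline uint64_t ror64(uint64_t x, unsigned int n)
	{
		return (x >> n) | (x << (64 - n));
	}

	/* sigma0: rotates by 1 and 8, shift by 7 (the 8-7, 7-1, 1 steps). */
	static inline uint64_t s0(uint64_t x)
	{
		return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7);
	}

	/* sigma1: rotates by 19 and 61, shift by 6 (the 61-19, 19-6, 6 steps). */
	static inline uint64_t s1(uint64_t x)
	{
		return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6);
	}

	/* Extend the first 16 message qwords to the full 80-entry schedule. */
	static void sha512_schedule(uint64_t W[80])
	{
		for (int t = 16; t < 80; t++)
			W[t] = s1(W[t - 2]) + W[t - 7] + s0(W[t - 15]) + W[t - 16];
	}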
diff --git a/lib/crypto/x86/sha512.h b/lib/crypto/x86/sha512.h
new file mode 100644
index 000000000000..0213c70cedd0
--- /dev/null
+++ b/lib/crypto/x86/sha512.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * x86-optimized SHA-512 block function
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <linux/static_call.h>
+
+DEFINE_STATIC_CALL(sha512_blocks_x86, sha512_blocks_generic);
+
+#define DEFINE_X86_SHA512_FN(c_fn, asm_fn) \
+ asmlinkage void asm_fn(struct sha512_block_state *state, \
+ const u8 *data, size_t nblocks); \
+ static void c_fn(struct sha512_block_state *state, const u8 *data, \
+ size_t nblocks) \
+ { \
+ if (likely(irq_fpu_usable())) { \
+ kernel_fpu_begin(); \
+ asm_fn(state, data, nblocks); \
+ kernel_fpu_end(); \
+ } else { \
+ sha512_blocks_generic(state, data, nblocks); \
+ } \
+ }
+
+DEFINE_X86_SHA512_FN(sha512_blocks_ssse3, sha512_transform_ssse3);
+DEFINE_X86_SHA512_FN(sha512_blocks_avx, sha512_transform_avx);
+DEFINE_X86_SHA512_FN(sha512_blocks_avx2, sha512_transform_rorx);
+
+static void sha512_blocks(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ static_call(sha512_blocks_x86)(state, data, nblocks);
+}
+
+#define sha512_mod_init_arch sha512_mod_init_arch
+static void sha512_mod_init_arch(void)
+{
+ if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) &&
+ boot_cpu_has(X86_FEATURE_AVX)) {
+ if (boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_BMI2))
+ static_call_update(sha512_blocks_x86,
+ sha512_blocks_avx2);
+ else
+ static_call_update(sha512_blocks_x86,
+ sha512_blocks_avx);
+ } else if (boot_cpu_has(X86_FEATURE_SSSE3)) {
+ static_call_update(sha512_blocks_x86, sha512_blocks_ssse3);
+ }
+}
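
For readability, here is roughly what the DEFINE_X86_SHA512_FN() macro above
expands to for the SSSE3 case (expansion shown for illustration):

	asmlinkage void sha512_transform_ssse3(struct sha512_block_state *state,
					       const u8 *data, size_t nblocks);

	static void sha512_blocks_ssse3(struct sha512_block_state *state,
					const u8 *data, size_t nblocks)
	{
		if (likely(irq_fpu_usable())) {
			kernel_fpu_begin();
			sha512_transform_ssse3(state, data, nblocks);
			kernel_fpu_end();
		} else {
			/* FPU unusable in this context: fall back to C code. */
			sha512_blocks_generic(state, data, nblocks);
		}
	}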
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 7f50c4480a4e..ecf8e7f978e3 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -714,13 +714,13 @@ static void debug_objects_fill_pool(void)
* raw_spinlock_t are basically the same type and this lock-type
* inversion works just fine.
*/
- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible() || system_state < SYSTEM_SCHEDULING) {
/*
* Annotate away the spinlock_t inside raw_spinlock_t warning
- * by temporarily raising the wait-type to WAIT_SLEEP, matching
+ * by temporarily raising the wait-type to LD_WAIT_CONFIG, matching
* the preemptible() condition above.
*/
- static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_SLEEP);
+ static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_CONFIG);
lock_map_acquire_try(&fill_pool_map);
fill_pool();
lock_map_release(&fill_pool_map);
diff --git a/lib/decompress.c b/lib/decompress.c
index ab3fc90ffc64..7785471586c6 100644
--- a/lib/decompress.c
+++ b/lib/decompress.c
@@ -49,15 +49,15 @@ struct compress_format {
};
static const struct compress_format compressed_formats[] __initconst = {
- { {0x1f, 0x8b}, "gzip", gunzip },
- { {0x1f, 0x9e}, "gzip", gunzip },
- { {0x42, 0x5a}, "bzip2", bunzip2 },
- { {0x5d, 0x00}, "lzma", unlzma },
- { {0xfd, 0x37}, "xz", unxz },
- { {0x89, 0x4c}, "lzo", unlzo },
- { {0x02, 0x21}, "lz4", unlz4 },
- { {0x28, 0xb5}, "zstd", unzstd },
- { {0, 0}, NULL, NULL }
+ { .magic = {0x1f, 0x8b}, .name = "gzip", .decompressor = gunzip },
+ { .magic = {0x1f, 0x9e}, .name = "gzip", .decompressor = gunzip },
+ { .magic = {0x42, 0x5a}, .name = "bzip2", .decompressor = bunzip2 },
+ { .magic = {0x5d, 0x00}, .name = "lzma", .decompressor = unlzma },
+ { .magic = {0xfd, 0x37}, .name = "xz", .decompressor = unxz },
+ { .magic = {0x89, 0x4c}, .name = "lzo", .decompressor = unlzo },
+ { .magic = {0x02, 0x21}, .name = "lz4", .decompressor = unlz4 },
+ { .magic = {0x28, 0xb5}, .name = "zstd", .decompressor = unzstd },
+ { /* sentinel */ }
};
decompress_fn __init decompress_method(const unsigned char *inbuf, long len,
@@ -73,11 +73,10 @@ decompress_fn __init decompress_method(const unsigned char *inbuf, long len,
pr_debug("Compressed data magic: %#.2x %#.2x\n", inbuf[0], inbuf[1]);
- for (cf = compressed_formats; cf->name; cf++) {
+ for (cf = compressed_formats; cf->name; cf++)
if (!memcmp(inbuf, cf->magic, 2))
break;
- }
if (name)
*name = cf->name;
return cf->decompressor;
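
The new { /* sentinel */ } entry relies on the zero-initialized element having
a NULL .name, which is what terminates the lookup loop above. A brief caller
sketch (hypothetical buffer, using the gzip magic from the table):

	unsigned char buf[2] = { 0x1f, 0x8b };	/* gzip magic */
	const char *name;
	decompress_fn fn = decompress_method(buf, sizeof(buf), &name);

	if (fn)
		pr_debug("compressed with %s\n", name);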
diff --git a/lib/digsig.c b/lib/digsig.c
index 04b5e55ed95f..9dd319c12c7d 100644
--- a/lib/digsig.c
+++ b/lib/digsig.c
@@ -18,15 +18,11 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/key.h>
-#include <linux/crypto.h>
-#include <crypto/hash.h>
#include <crypto/sha1.h>
#include <keys/user-type.h>
#include <linux/mpi.h>
#include <linux/digsig.h>
-static struct crypto_shash *shash;
-
static const char *pkcs_1_v1_5_decode_emsa(const unsigned char *msg,
unsigned long msglen,
unsigned long modulus_bitlen,
@@ -159,7 +155,6 @@ static int digsig_verify_rsa(struct key *key,
len = mlen;
head = len - l;
- memset(out1, 0, head);
memcpy(out1 + head, p, l);
kfree(p);
@@ -199,12 +194,12 @@ err1:
int digsig_verify(struct key *keyring, const char *sig, int siglen,
const char *data, int datalen)
{
- int err = -ENOMEM;
struct signature_hdr *sh = (struct signature_hdr *)sig;
- struct shash_desc *desc = NULL;
+ struct sha1_ctx ctx;
unsigned char hash[SHA1_DIGEST_SIZE];
struct key *key;
char name[20];
+ int err;
if (siglen < sizeof(*sh) + 2)
return -EINVAL;
@@ -231,49 +226,19 @@ int digsig_verify(struct key *keyring, const char *sig, int siglen,
return PTR_ERR(key);
}
- desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(shash),
- GFP_KERNEL);
- if (!desc)
- goto err;
-
- desc->tfm = shash;
-
- crypto_shash_init(desc);
- crypto_shash_update(desc, data, datalen);
- crypto_shash_update(desc, sig, sizeof(*sh));
- crypto_shash_final(desc, hash);
-
- kfree(desc);
+ sha1_init(&ctx);
+ sha1_update(&ctx, data, datalen);
+ sha1_update(&ctx, sig, sizeof(*sh));
+ sha1_final(&ctx, hash);
/* pass signature mpis address */
err = digsig_verify_rsa(key, sig + sizeof(*sh), siglen - sizeof(*sh),
hash, sizeof(hash));
-err:
key_put(key);
return err ? -EINVAL : 0;
}
EXPORT_SYMBOL_GPL(digsig_verify);
-static int __init digsig_init(void)
-{
- shash = crypto_alloc_shash("sha1", 0, 0);
- if (IS_ERR(shash)) {
- pr_err("shash allocation failed\n");
- return PTR_ERR(shash);
- }
-
- return 0;
-
-}
-
-static void __exit digsig_cleanup(void)
-{
- crypto_free_shash(shash);
-}
-
-module_init(digsig_init);
-module_exit(digsig_cleanup);
-
MODULE_LICENSE("GPL");
diff --git a/lib/dump_stack.c b/lib/dump_stack.c
index b3a85fe8b673..f0c78b5b5324 100644
--- a/lib/dump_stack.c
+++ b/lib/dump_stack.c
@@ -102,7 +102,7 @@ static void __dump_stack(const char *log_lvl)
*/
asmlinkage __visible void dump_stack_lvl(const char *log_lvl)
{
- bool in_panic = this_cpu_in_panic();
+ bool in_panic = panic_on_this_cpu();
unsigned long flags;
/*
diff --git a/lib/fault-inject-usercopy.c b/lib/fault-inject-usercopy.c
index 77558b6c29ca..75403ec50f49 100644
--- a/lib/fault-inject-usercopy.c
+++ b/lib/fault-inject-usercopy.c
@@ -22,10 +22,8 @@ static int __init fail_usercopy_debugfs(void)
dir = fault_create_debugfs_attr("fail_usercopy", NULL,
&fail_usercopy.attr);
- if (IS_ERR(dir))
- return PTR_ERR(dir);
- return 0;
+ return PTR_ERR_OR_ZERO(dir);
}
late_initcall(fail_usercopy_debugfs);
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 06b6342aa3ae..d4b5a29e3e72 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -18,6 +18,7 @@
#include <linux/math.h>
#include <linux/minmax.h>
#include <linux/swab.h>
+#include <linux/random.h>
/*
* Common helper for find_bit() function family
@@ -291,3 +292,26 @@ EXPORT_SYMBOL(_find_next_bit_le);
#endif
#endif /* __BIG_ENDIAN */
+
+/**
+ * find_random_bit - find a set bit at a random position
+ * @addr: The address to base the search on
+ * @size: The bitmap size in bits
+ *
+ * Returns: the position of a randomly chosen set bit; >= @size if the bitmap is empty
+ */
+unsigned long find_random_bit(const unsigned long *addr, unsigned long size)
+{
+ int w = bitmap_weight(addr, size);
+
+ switch (w) {
+ case 0:
+ return size;
+ case 1:
+ /* Performance trick for single-bit bitmaps */
+ return find_first_bit(addr, size);
+ default:
+ return find_nth_bit(addr, size, get_random_u32_below(w));
+ }
+}
+EXPORT_SYMBOL(find_random_bit);
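
A hypothetical usage sketch for the new helper, drawing one set bit uniformly
at random from a bitmap:

	DECLARE_BITMAP(mask, 64);
	unsigned long bit;

	bitmap_zero(mask, 64);
	bitmap_set(mask, 3, 5);		/* set bits 3..7 */

	bit = find_random_bit(mask, 64);
	if (bit < 64)
		pr_info("picked bit %lu\n", bit);	/* uniformly one of 3..7 */
	else
		pr_info("no bits set\n");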
diff --git a/lib/find_bit_benchmark_rust.rs b/lib/find_bit_benchmark_rust.rs
new file mode 100644
index 000000000000..6bdc51de2f30
--- /dev/null
+++ b/lib/find_bit_benchmark_rust.rs
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+//! Benchmark for find_bit-like methods in the Bitmap Rust API.
+
+use kernel::alloc::flags::GFP_KERNEL;
+use kernel::bindings;
+use kernel::bitmap::BitmapVec;
+use kernel::error::{code, Result};
+use kernel::prelude::module;
+use kernel::time::{Instant, Monotonic};
+use kernel::ThisModule;
+use kernel::{pr_cont, pr_err};
+
+const BITMAP_LEN: usize = 4096 * 8 * 10;
+// Reciprocal of the fraction of bits that are set in the sparse bitmap.
+const SPARSENESS: usize = 500;
+
+/// Test module that benchmarks performance of traversing bitmaps.
+struct Benchmark();
+
+fn test_next_bit(bitmap: &BitmapVec) {
+ let time = Instant::<Monotonic>::now();
+ let mut cnt = 0;
+ let mut i = 0;
+
+ while let Some(index) = bitmap.next_bit(i) {
+ cnt += 1;
+ i = index + 1;
+ // CONFIG_RUST_BITMAP_HARDENED enforces strict bounds.
+ if i == BITMAP_LEN {
+ break;
+ }
+ }
+
+ let delta = time.elapsed();
+ pr_cont!(
+ "\nnext_bit: {:18} ns, {:6} iterations",
+ delta.as_nanos(),
+ cnt
+ );
+}
+
+fn test_next_zero_bit(bitmap: &BitmapVec) {
+ let time = Instant::<Monotonic>::now();
+ let mut cnt = 0;
+ let mut i = 0;
+
+ while let Some(index) = bitmap.next_zero_bit(i) {
+ cnt += 1;
+ i = index + 1;
+ // CONFIG_RUST_BITMAP_HARDENED enforces strict bounds.
+ if i == BITMAP_LEN {
+ break;
+ }
+ }
+
+ let delta = time.elapsed();
+ pr_cont!(
+ "\nnext_zero_bit: {:18} ns, {:6} iterations",
+ delta.as_nanos(),
+ cnt
+ );
+}
+
+fn find_bit_test() {
+ pr_err!("Benchmark");
+ pr_cont!("\nStart testing find_bit() Rust with random-filled bitmap");
+
+ let mut bitmap = BitmapVec::new(BITMAP_LEN, GFP_KERNEL).expect("alloc bitmap failed");
+ bitmap.fill_random();
+
+ test_next_bit(&bitmap);
+ test_next_zero_bit(&bitmap);
+
+ pr_cont!("\nStart testing find_bit() Rust with sparse bitmap");
+
+ let mut bitmap = BitmapVec::new(BITMAP_LEN, GFP_KERNEL).expect("alloc sparse bitmap failed");
+ let nbits = BITMAP_LEN / SPARSENESS;
+ for _i in 0..nbits {
+ // SAFETY: __get_random_u32_below is safe to call with any u32 argument.
+ let bit =
+ unsafe { bindings::__get_random_u32_below(BITMAP_LEN.try_into().unwrap()) as usize };
+ bitmap.set_bit(bit);
+ }
+
+ test_next_bit(&bitmap);
+ test_next_zero_bit(&bitmap);
+ pr_cont!("\n");
+}
+
+impl kernel::Module for Benchmark {
+ fn init(_module: &'static ThisModule) -> Result<Self> {
+ find_bit_test();
+ // Return error so test module can be inserted again without rmmod.
+ Err(code::EINVAL)
+ }
+}
+
+module! {
+ type: Benchmark,
+ name: "find_bit_benchmark_rust",
+ authors: ["Burak Emir <bqe@google.com>"],
+ description: "Module with benchmark for bitmap Rust API",
+ license: "GPL v2",
+}
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 4fa5635bf81b..841f29783833 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -899,8 +899,11 @@ struct gen_pool *of_gen_pool_get(struct device_node *np,
if (!name)
name = of_node_full_name(np_pool);
}
- if (pdev)
+ if (pdev) {
pool = gen_pool_get(&pdev->dev, name);
+ put_device(&pdev->dev);
+ }
+
of_node_put(np_pool);
return pool;
diff --git a/lib/group_cpus.c b/lib/group_cpus.c
index ee272c4cefcc..6d08ac05f371 100644
--- a/lib/group_cpus.c
+++ b/lib/group_cpus.c
@@ -332,9 +332,11 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
/**
* group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
* @numgrps: number of groups
+ * @nummasks: number of initialized cpumasks
*
* Return: cpumask array if successful, NULL otherwise. And each element
- * includes CPUs assigned to this group
+ * includes the CPUs assigned to this group. @nummasks is set to the number
+ * of initialized masks, which can be less than @numgrps.
*
* Try to put close CPUs from viewpoint of CPU and NUMA locality into
* same group, and run two-stage grouping:
@@ -344,7 +346,7 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
* We guarantee in the resulted grouping that all CPUs are covered, and
* no same CPU is assigned to multiple groups
*/
-struct cpumask *group_cpus_evenly(unsigned int numgrps)
+struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
{
unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
cpumask_var_t *node_to_cpumask;
@@ -352,6 +354,9 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps)
int ret = -ENOMEM;
struct cpumask *masks = NULL;
+ if (numgrps == 0)
+ return NULL;
+
if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
return NULL;
@@ -386,7 +391,7 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps)
ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
npresmsk, nmsk, masks);
if (ret < 0)
- goto fail_build_affinity;
+ goto fail_node_to_cpumask;
nr_present = ret;
/*
@@ -405,10 +410,6 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps)
if (ret >= 0)
nr_others = ret;
- fail_build_affinity:
- if (ret >= 0)
- WARN_ON(nr_present + nr_others < numgrps);
-
fail_node_to_cpumask:
free_node_to_cpumask(node_to_cpumask);
@@ -421,18 +422,24 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps)
kfree(masks);
return NULL;
}
+ *nummasks = min(nr_present + nr_others, numgrps);
return masks;
}
#else /* CONFIG_SMP */
-struct cpumask *group_cpus_evenly(unsigned int numgrps)
+struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
{
- struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
+ struct cpumask *masks;
+
+ if (numgrps == 0)
+ return NULL;
+ masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
if (!masks)
return NULL;
/* assign all CPUs(cpu 0) to the 1st group only */
cpumask_copy(&masks[0], cpu_possible_mask);
+ *nummasks = 1;
return masks;
}
#endif /* CONFIG_SMP */
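
A hypothetical caller sketch for the changed interface; because fewer masks
than requested may be initialized, callers must iterate over *nummasks rather
than the requested numgrps:

	unsigned int nummasks;
	struct cpumask *masks = group_cpus_evenly(numgrps, &nummasks);

	if (!masks)
		return -ENOMEM;
	for (unsigned int i = 0; i < nummasks; i++)
		pr_debug("group %u: %*pbl\n", i, cpumask_pr_args(&masks[i]));
	kfree(masks);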
diff --git a/lib/interval_tree.c b/lib/interval_tree.c
index 324766e9bf63..9ceb084b6b4e 100644
--- a/lib/interval_tree.c
+++ b/lib/interval_tree.c
@@ -13,6 +13,7 @@ INTERVAL_TREE_DEFINE(struct interval_tree_node, rb,
EXPORT_SYMBOL_GPL(interval_tree_insert);
EXPORT_SYMBOL_GPL(interval_tree_remove);
+EXPORT_SYMBOL_GPL(interval_tree_subtree_search);
EXPORT_SYMBOL_GPL(interval_tree_iter_first);
EXPORT_SYMBOL_GPL(interval_tree_iter_next);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f9193f952f49..896760bad455 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -49,12 +49,24 @@ size_t copy_from_user_iter(void __user *iter_from, size_t progress,
if (should_fail_usercopy())
return len;
- if (access_ok(iter_from, len)) {
- to += progress;
- instrument_copy_from_user_before(to, iter_from, len);
- res = raw_copy_from_user(to, iter_from, len);
- instrument_copy_from_user_after(to, iter_from, len, res);
+ if (can_do_masked_user_access()) {
+ iter_from = mask_user_address(iter_from);
+ } else {
+ if (!access_ok(iter_from, len))
+ return res;
+
+ /*
+ * Ensure that bad access_ok() speculation will not
+ * lead to nasty side effects *after* the copy is
+ * finished:
+ */
+ barrier_nospec();
}
+ to += progress;
+ instrument_copy_from_user_before(to, iter_from, len);
+ res = raw_copy_from_user(to, iter_from, len);
+ instrument_copy_from_user_after(to, iter_from, len, res);
+
return res;
}
@@ -784,101 +796,6 @@ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
}
EXPORT_SYMBOL(iov_iter_discard);
-static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
- unsigned len_mask)
-{
- const struct iovec *iov = iter_iov(i);
- size_t size = i->count;
- size_t skip = i->iov_offset;
-
- do {
- size_t len = iov->iov_len - skip;
-
- if (len > size)
- len = size;
- if (len & len_mask)
- return false;
- if ((unsigned long)(iov->iov_base + skip) & addr_mask)
- return false;
-
- iov++;
- size -= len;
- skip = 0;
- } while (size);
-
- return true;
-}
-
-static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
- unsigned len_mask)
-{
- const struct bio_vec *bvec = i->bvec;
- unsigned skip = i->iov_offset;
- size_t size = i->count;
-
- do {
- size_t len = bvec->bv_len - skip;
-
- if (len > size)
- len = size;
- if (len & len_mask)
- return false;
- if ((unsigned long)(bvec->bv_offset + skip) & addr_mask)
- return false;
-
- bvec++;
- size -= len;
- skip = 0;
- } while (size);
-
- return true;
-}
-
-/**
- * iov_iter_is_aligned() - Check if the addresses and lengths of each segments
- * are aligned to the parameters.
- *
- * @i: &struct iov_iter to restore
- * @addr_mask: bit mask to check against the iov element's addresses
- * @len_mask: bit mask to check against the iov element's lengths
- *
- * Return: false if any addresses or lengths intersect with the provided masks
- */
-bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
- unsigned len_mask)
-{
- if (likely(iter_is_ubuf(i))) {
- if (i->count & len_mask)
- return false;
- if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask)
- return false;
- return true;
- }
-
- if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
- return iov_iter_aligned_iovec(i, addr_mask, len_mask);
-
- if (iov_iter_is_bvec(i))
- return iov_iter_aligned_bvec(i, addr_mask, len_mask);
-
- /* With both xarray and folioq types, we're dealing with whole folios. */
- if (iov_iter_is_xarray(i)) {
- if (i->count & len_mask)
- return false;
- if ((i->xarray_start + i->iov_offset) & addr_mask)
- return false;
- }
- if (iov_iter_is_folioq(i)) {
- if (i->count & len_mask)
- return false;
- if (i->iov_offset & addr_mask)
- return false;
- }
-
- return true;
-}
-EXPORT_SYMBOL_GPL(iov_iter_is_aligned);
-
static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
{
const struct iovec *iov = iter_iov(i);
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index b7f2fa08d9c8..78e16b95d210 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -826,3 +826,23 @@ static int __init kobject_uevent_init(void)
postcore_initcall(kobject_uevent_init);
#endif
+
+#ifdef CONFIG_UEVENT_HELPER
+static const struct ctl_table uevent_helper_sysctl_table[] = {
+ {
+ .procname = "hotplug",
+ .data = &uevent_helper,
+ .maxlen = UEVENT_HELPER_PATH_LEN,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
+};
+
+static int __init init_uevent_helper_sysctl(void)
+{
+ register_sysctl_init("kernel", uevent_helper_sysctl_table);
+ return 0;
+}
+
+postcore_initcall(init_uevent_helper_sysctl);
+#endif
diff --git a/lib/kunit/Kconfig b/lib/kunit/Kconfig
index a97897edd964..7a6af361d2fc 100644
--- a/lib/kunit/Kconfig
+++ b/lib/kunit/Kconfig
@@ -93,4 +93,28 @@ config KUNIT_AUTORUN_ENABLED
In most cases this should be left as Y. Only if additional opt-in
behavior is needed should this be set to N.
+config KUNIT_DEFAULT_TIMEOUT
+ int "Default value of the timeout module parameter"
+ default 300
+ help
+	  Sets the default timeout, in seconds, for KUnit test cases. This value
+	  is further multiplied by a factor determined by the assigned speed
+	  setting: 1x for `DEFAULT`, 3x for `KUNIT_SPEED_SLOW`, and 12x for
+	  `KUNIT_SPEED_VERY_SLOW`. This gives slower tests on slower machines
+	  sufficient time to complete. For example, a `KUNIT_SPEED_SLOW` test
+	  with the default value times out after 3 * 300 = 900 seconds.
+
+ If unsure, the default timeout of 300 seconds is suitable for most
+ cases.
+
+config KUNIT_UML_PCI
+ bool "KUnit UML PCI Support"
+ depends on UML
+ select UML_PCI
+ help
+ Enables the PCI subsystem on UML for use by KUnit tests.
+ Some KUnit tests require the PCI core which is not enabled by
+ default on UML.
+
+ If unsure, say N.
+
endif # KUNIT
diff --git a/lib/kunit/Makefile b/lib/kunit/Makefile
index 5aa51978e456..656f1fa35abc 100644
--- a/lib/kunit/Makefile
+++ b/lib/kunit/Makefile
@@ -17,7 +17,7 @@ kunit-objs += debugfs.o
endif
# KUnit 'hooks' are built-in even when KUnit is built as a module.
-obj-y += hooks.o
+obj-$(if $(CONFIG_KUNIT),y) += hooks.o
obj-$(CONFIG_KUNIT_TEST) += kunit-test.o
obj-$(CONFIG_KUNIT_TEST) += platform-test.o
diff --git a/lib/kunit/kunit-example-test.c b/lib/kunit/kunit-example-test.c
index 3056d6bc705d..9452b163956f 100644
--- a/lib/kunit/kunit-example-test.c
+++ b/lib/kunit/kunit-example-test.c
@@ -278,6 +278,218 @@ static void example_slow_test(struct kunit *test)
}
/*
+ * This custom function allocates memory and sets the information we want
+ * stored in the kunit_resource->data field.
+ */
+static int example_resource_init(struct kunit_resource *res, void *context)
+{
+ int *info = kmalloc(sizeof(*info), GFP_KERNEL);
+
+ if (!info)
+ return -ENOMEM;
+ *info = *(int *)context;
+ res->data = info;
+ return 0;
+}
+
+/*
+ * This function deallocates memory for the kunit_resource->data field.
+ */
+static void example_resource_free(struct kunit_resource *res)
+{
+ kfree(res->data);
+}
+
+/*
+ * This match function is invoked by kunit_find_resource() to locate
+ * a test resource based on certain criteria.
+ */
+static bool example_resource_alloc_match(struct kunit *test,
+ struct kunit_resource *res,
+ void *match_data)
+{
+ return res->data && res->free == example_resource_free;
+}
+
+/*
+ * This is an example of a function that provides a description for each of the
+ * parameters in a parameterized test.
+ */
+static void example_param_array_get_desc(struct kunit *test, const void *p, char *desc)
+{
+ const struct example_param *param = p;
+
+ snprintf(desc, KUNIT_PARAM_DESC_SIZE,
+ "example check if %d is less than or equal to 3", param->value);
+}
+
+/*
+ * This function gets passed in the parameterized test context i.e. the
+ * struct kunit belonging to the parameterized test. You can use this function
+ * to add resources you want shared across the whole parameterized test or
+ * for additional setup.
+ */
+static int example_param_init(struct kunit *test)
+{
+ int ctx = 3; /* Data to be stored. */
+ size_t arr_size = ARRAY_SIZE(example_params_array);
+
+ /*
+ * This allocates a struct kunit_resource, sets its data field to
+	 * ctx, and adds it to the struct kunit's resources list. Note that
+	 * this resource is managed by the parameterized test, so it doesn't
+	 * need a custom exit function for deallocation; it will get cleaned
+	 * up at the end of the parameterized test.
+ */
+ void *data = kunit_alloc_resource(test, example_resource_init, example_resource_free,
+ GFP_KERNEL, &ctx);
+
+ if (!data)
+ return -ENOMEM;
+ /*
+ * Pass the parameter array information to the parameterized test context
+ * struct kunit. Note that you will need to provide kunit_array_gen_params()
+ * as the generator function to KUNIT_CASE_PARAM_WITH_INIT() when registering
+ * a parameter array this route.
+ */
+ kunit_register_params_array(test, example_params_array, arr_size,
+ example_param_array_get_desc);
+ return 0;
+}
+
+/*
+ * This is an example of a test that uses shared resources available in the
+ * parameterized test context.
+ */
+static void example_params_test_with_init(struct kunit *test)
+{
+ int threshold;
+ struct kunit_resource *res;
+ const struct example_param *param = test->param_value;
+
+ /* By design, param pointer will not be NULL. */
+ KUNIT_ASSERT_NOT_NULL(test, param);
+
+ /*
+ * Here we pass test->parent to search for shared resources in the
+ * parameterized test context.
+ */
+ res = kunit_find_resource(test->parent, example_resource_alloc_match, NULL);
+
+ KUNIT_ASSERT_NOT_NULL(test, res);
+
+ /* Since kunit_resource->data is a void pointer we need to typecast it. */
+ threshold = *((int *)res->data);
+
+ /* Assert that the parameter is less than or equal to a certain threshold. */
+ KUNIT_ASSERT_LE(test, param->value, threshold);
+
+ /* This decreases the reference count after calling kunit_find_resource(). */
+ kunit_put_resource(res);
+}
+
+/*
+ * Helper function to create a parameter array of Fibonacci numbers. This example
+ * highlights a parameter generation scenario that is:
+ * 1. Not feasible to fully pre-generate at compile time.
+ * 2. Challenging to implement with a standard generate_params() function,
+ * as it only provides the previous parameter, while Fibonacci requires
+ * access to two preceding values for calculation.
+ */
+static void *make_fibonacci_params(struct kunit *test, size_t seq_size)
+{
+ int *seq;
+
+ if (seq_size <= 0)
+ return NULL;
+ /*
+ * Using kunit_kmalloc_array here ties the lifetime of the array to
+	 * the parameterized test, i.e. it will get automatically cleaned up
+ * by KUnit after the parameterized test finishes.
+ */
+ seq = kunit_kmalloc_array(test, seq_size, sizeof(int), GFP_KERNEL);
+
+ if (!seq)
+ return NULL;
+ if (seq_size >= 1)
+ seq[0] = 0;
+ if (seq_size >= 2)
+ seq[1] = 1;
+ for (int i = 2; i < seq_size; i++)
+ seq[i] = seq[i - 1] + seq[i - 2];
+ return seq;
+}
+
+/*
+ * This is an example of a function that provides a description for each of the
+ * parameters.
+ */
+static void example_param_dynamic_arr_get_desc(struct kunit *test, const void *p, char *desc)
+{
+ const int *fib_num = p;
+
+ snprintf(desc, KUNIT_PARAM_DESC_SIZE, "fibonacci param: %d", *fib_num);
+}
+
+/*
+ * Example of a parameterized test param_init() function that registers a dynamic
+ * array of parameters.
+ */
+static int example_param_init_dynamic_arr(struct kunit *test)
+{
+ size_t seq_size;
+ int *fibonacci_params;
+
+ kunit_info(test, "initializing parameterized test\n");
+
+ seq_size = 6;
+ fibonacci_params = make_fibonacci_params(test, seq_size);
+
+ if (!fibonacci_params)
+ return -ENOMEM;
+
+ /*
+ * Passes the dynamic parameter array information to the parameterized test
+ * context struct kunit. The array and its metadata will be stored in
+ * test->parent->params_array. The array itself will be located in
+ * params_data.params.
+ *
+ * Note that you will need to pass kunit_array_gen_params() as the
+ * generator function to KUNIT_CASE_PARAM_WITH_INIT() when registering
+	 * a parameter array via this route.
+ */
+ kunit_register_params_array(test, fibonacci_params, seq_size,
+ example_param_dynamic_arr_get_desc);
+ return 0;
+}
+
+/*
+ * Example of a parameterized test param_exit() function that outputs a log
+ * at the end of the parameterized test. It could also be used for any other
+ * teardown logic.
+ */
+static void example_param_exit_dynamic_arr(struct kunit *test)
+{
+ kunit_info(test, "exiting parameterized test\n");
+}
+
+/*
+ * Example of test that uses the registered dynamic array to perform assertions
+ * and expectations.
+ */
+static void example_params_test_with_init_dynamic_arr(struct kunit *test)
+{
+ const int *param = test->param_value;
+ int param_val;
+
+ /* By design, param pointer will not be NULL. */
+ KUNIT_ASSERT_NOT_NULL(test, param);
+
+ param_val = *param;
+ KUNIT_EXPECT_EQ(test, param_val - param_val, 0);
+}
+
+/*
* Here we make a list of all the test cases we want to add to the test suite
* below.
*/
@@ -296,6 +508,11 @@ static struct kunit_case example_test_cases[] = {
KUNIT_CASE(example_static_stub_using_fn_ptr_test),
KUNIT_CASE(example_priv_test),
KUNIT_CASE_PARAM(example_params_test, example_gen_params),
+ KUNIT_CASE_PARAM_WITH_INIT(example_params_test_with_init, kunit_array_gen_params,
+ example_param_init, NULL),
+ KUNIT_CASE_PARAM_WITH_INIT(example_params_test_with_init_dynamic_arr,
+ kunit_array_gen_params, example_param_init_dynamic_arr,
+ example_param_exit_dynamic_arr),
KUNIT_CASE_SLOW(example_slow_test),
{}
};
diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c
index d9c781c859fd..63130a48e237 100644
--- a/lib/kunit/kunit-test.c
+++ b/lib/kunit/kunit-test.c
@@ -8,6 +8,7 @@
#include "linux/gfp_types.h"
#include <kunit/test.h>
#include <kunit/test-bug.h>
+#include <kunit/static_stub.h>
#include <linux/device.h>
#include <kunit/device.h>
@@ -43,7 +44,8 @@ static void kunit_test_try_catch_successful_try_no_catch(struct kunit *test)
kunit_try_catch_init(try_catch,
test,
kunit_test_successful_try,
- kunit_test_no_catch);
+ kunit_test_no_catch,
+ 300 * msecs_to_jiffies(MSEC_PER_SEC));
kunit_try_catch_run(try_catch, test);
KUNIT_EXPECT_TRUE(test, ctx->function_called);
@@ -75,7 +77,8 @@ static void kunit_test_try_catch_unsuccessful_try_does_catch(struct kunit *test)
kunit_try_catch_init(try_catch,
test,
kunit_test_unsuccessful_try,
- kunit_test_catch);
+ kunit_test_catch,
+ 300 * msecs_to_jiffies(MSEC_PER_SEC));
kunit_try_catch_run(try_catch, test);
KUNIT_EXPECT_TRUE(test, ctx->function_called);
@@ -129,7 +132,8 @@ static void kunit_test_fault_null_dereference(struct kunit *test)
kunit_try_catch_init(try_catch,
test,
kunit_test_null_dereference,
- kunit_test_catch);
+ kunit_test_catch,
+ 300 * msecs_to_jiffies(MSEC_PER_SEC));
kunit_try_catch_run(try_catch, test);
KUNIT_EXPECT_EQ(test, try_catch->try_result, -EINTR);
@@ -735,7 +739,7 @@ static struct kunit_case kunit_current_test_cases[] = {
static void test_dev_action(void *priv)
{
- *(void **)priv = (void *)1;
+ *(long *)priv = 1;
}
static void kunit_device_test(struct kunit *test)
@@ -868,10 +872,53 @@ static struct kunit_suite kunit_current_test_suite = {
.test_cases = kunit_current_test_cases,
};
+static void kunit_stub_test(struct kunit *test)
+{
+ struct kunit fake_test;
+ const unsigned long fake_real_fn_addr = 0x1234;
+ const unsigned long fake_replacement_addr = 0x5678;
+ struct kunit_resource *res;
+ struct {
+ void *real_fn_addr;
+ void *replacement_addr;
+ } *stub_ctx;
+
+ kunit_init_test(&fake_test, "kunit_stub_fake_test", NULL);
+ KUNIT_ASSERT_EQ(test, fake_test.status, KUNIT_SUCCESS);
+ KUNIT_ASSERT_EQ(test, list_count_nodes(&fake_test.resources), 0);
+
+ __kunit_activate_static_stub(&fake_test, (void *)fake_real_fn_addr,
+ (void *)fake_replacement_addr);
+ KUNIT_ASSERT_EQ(test, fake_test.status, KUNIT_SUCCESS);
+ KUNIT_ASSERT_EQ(test, list_count_nodes(&fake_test.resources), 1);
+
+ res = list_first_entry(&fake_test.resources, struct kunit_resource, node);
+ KUNIT_EXPECT_NOT_NULL(test, res);
+
+ stub_ctx = res->data;
+ KUNIT_EXPECT_NOT_NULL(test, stub_ctx);
+ KUNIT_EXPECT_EQ(test, (unsigned long)stub_ctx->real_fn_addr, fake_real_fn_addr);
+ KUNIT_EXPECT_EQ(test, (unsigned long)stub_ctx->replacement_addr, fake_replacement_addr);
+
+ __kunit_activate_static_stub(&fake_test, (void *)fake_real_fn_addr, NULL);
+ KUNIT_ASSERT_EQ(test, fake_test.status, KUNIT_SUCCESS);
+ KUNIT_ASSERT_EQ(test, list_count_nodes(&fake_test.resources), 0);
+}
+
+static struct kunit_case kunit_stub_test_cases[] = {
+ KUNIT_CASE(kunit_stub_test),
+ {}
+};
+
+static struct kunit_suite kunit_stub_test_suite = {
+ .name = "kunit_stub",
+ .test_cases = kunit_stub_test_cases,
+};
+
kunit_test_suites(&kunit_try_catch_test_suite, &kunit_resource_test_suite,
&kunit_log_test_suite, &kunit_status_test_suite,
&kunit_current_test_suite, &kunit_device_test_suite,
- &kunit_fault_test_suite);
+ &kunit_fault_test_suite, &kunit_stub_test_suite);
MODULE_DESCRIPTION("KUnit test for core test infrastructure");
MODULE_LICENSE("GPL v2");
diff --git a/lib/kunit/test.c b/lib/kunit/test.c
index 146d1b48a096..62eb529824c6 100644
--- a/lib/kunit/test.c
+++ b/lib/kunit/test.c
@@ -70,6 +70,13 @@ module_param_named(enable, enable_param, bool, 0);
MODULE_PARM_DESC(enable, "Enable KUnit tests");
/*
+ * Configure the base timeout.
+ */
+static unsigned long kunit_base_timeout = CONFIG_KUNIT_DEFAULT_TIMEOUT;
+module_param_named(timeout, kunit_base_timeout, ulong, 0644);
+MODULE_PARM_DESC(timeout, "Set the base timeout for KUnit test cases");
+
+/*
* KUnit statistic mode:
* 0 - disabled
* 1 - only when there is more than one subtest
@@ -330,6 +337,14 @@ void __kunit_do_failed_assertion(struct kunit *test,
}
EXPORT_SYMBOL_GPL(__kunit_do_failed_assertion);
+static void kunit_init_params(struct kunit *test)
+{
+ test->params_array.params = NULL;
+ test->params_array.get_description = NULL;
+ test->params_array.num_params = 0;
+ test->params_array.elem_size = 0;
+}
+
void kunit_init_test(struct kunit *test, const char *name, struct string_stream *log)
{
spin_lock_init(&test->lock);
@@ -340,6 +355,7 @@ void kunit_init_test(struct kunit *test, const char *name, struct string_stream
string_stream_clear(log);
test->status = KUNIT_SUCCESS;
test->status_comment[0] = '\0';
+ kunit_init_params(test);
}
EXPORT_SYMBOL_GPL(kunit_init_test);
@@ -373,6 +389,40 @@ static void kunit_run_case_check_speed(struct kunit *test,
duration.tv_sec, duration.tv_nsec);
}
+/*
+ * Returns the timeout multiplier based on speed:
+ * DEFAULT: 1
+ * KUNIT_SPEED_SLOW: 3
+ * KUNIT_SPEED_VERY_SLOW: 12
+ */
+static int kunit_timeout_mult(enum kunit_speed speed)
+{
+ switch (speed) {
+ case KUNIT_SPEED_SLOW:
+ return 3;
+ case KUNIT_SPEED_VERY_SLOW:
+ return 12;
+ default:
+ return 1;
+ }
+}
+
+static unsigned long kunit_test_timeout(struct kunit_suite *suite, struct kunit_case *test_case)
+{
+ int mult = 1;
+
+ /*
+	 * The base timeout (CONFIG_KUNIT_DEFAULT_TIMEOUT, overridable via the
+	 * "timeout" module parameter) is adjusted by mult based on the test
+	 * speed. The speed set on the innermost component wins: a test case
+	 * speed overrides the suite speed.
+ */
+ if (suite->attr.speed != KUNIT_SPEED_UNSET)
+ mult = kunit_timeout_mult(suite->attr.speed);
+ if (test_case->attr.speed != KUNIT_SPEED_UNSET)
+ mult = kunit_timeout_mult(test_case->attr.speed);
+ return mult * kunit_base_timeout * msecs_to_jiffies(MSEC_PER_SEC);
+}
+
/*
* Initializes and runs test case. Does not clean up or do post validations.
*/
@@ -527,7 +577,8 @@ static void kunit_run_case_catch_errors(struct kunit_suite *suite,
kunit_try_catch_init(try_catch,
test,
kunit_try_run_case,
- kunit_catch_run_case);
+ kunit_catch_run_case,
+ kunit_test_timeout(suite, test_case));
context.test = test;
context.suite = suite;
context.test_case = test_case;
@@ -537,7 +588,8 @@ static void kunit_run_case_catch_errors(struct kunit_suite *suite,
kunit_try_catch_init(try_catch,
test,
kunit_try_run_case_cleanup,
- kunit_catch_run_case_cleanup);
+ kunit_catch_run_case_cleanup,
+ kunit_test_timeout(suite, test_case));
kunit_try_catch_run(try_catch, &context);
/* Propagate the parameter result to the test case. */
@@ -598,12 +650,44 @@ static void kunit_accumulate_stats(struct kunit_result_stats *total,
total->total += add.total;
}
+const void *kunit_array_gen_params(struct kunit *test, const void *prev, char *desc)
+{
+ struct kunit_params *params_arr = &test->params_array;
+ const void *param;
+
+ if (test->param_index < params_arr->num_params) {
+ param = (char *)params_arr->params
+ + test->param_index * params_arr->elem_size;
+
+ if (params_arr->get_description)
+ params_arr->get_description(test, param, desc);
+ return param;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(kunit_array_gen_params);
+
+static void kunit_init_parent_param_test(struct kunit_case *test_case, struct kunit *test)
+{
+ if (test_case->param_init) {
+ int err = test_case->param_init(test);
+
+ if (err) {
+ kunit_err(test_case, KUNIT_SUBTEST_INDENT KUNIT_SUBTEST_INDENT
+ "# failed to initialize parent parameter test (%d)", err);
+ test->status = KUNIT_FAILURE;
+ test_case->status = KUNIT_FAILURE;
+ }
+ }
+}
+
int kunit_run_tests(struct kunit_suite *suite)
{
char param_desc[KUNIT_PARAM_DESC_SIZE];
struct kunit_case *test_case;
struct kunit_result_stats suite_stats = { 0 };
struct kunit_result_stats total_stats = { 0 };
+ const void *curr_param;
/* Taint the kernel so we know we've run tests. */
add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
@@ -634,41 +718,65 @@ int kunit_run_tests(struct kunit_suite *suite)
kunit_run_case_catch_errors(suite, test_case, &test);
kunit_update_stats(&param_stats, test.status);
} else {
+ kunit_init_parent_param_test(test_case, &test);
+ if (test_case->status == KUNIT_FAILURE) {
+ kunit_update_stats(&param_stats, test.status);
+ goto test_case_end;
+ }
/* Get initial param. */
param_desc[0] = '\0';
- test.param_value = test_case->generate_params(NULL, param_desc);
+ /* TODO: Make generate_params try-catch */
+ curr_param = test_case->generate_params(&test, NULL, param_desc);
test_case->status = KUNIT_SKIPPED;
kunit_log(KERN_INFO, &test, KUNIT_SUBTEST_INDENT KUNIT_SUBTEST_INDENT
"KTAP version 1\n");
kunit_log(KERN_INFO, &test, KUNIT_SUBTEST_INDENT KUNIT_SUBTEST_INDENT
"# Subtest: %s", test_case->name);
+ if (test.params_array.params &&
+ test_case->generate_params == kunit_array_gen_params) {
+ kunit_log(KERN_INFO, &test, KUNIT_SUBTEST_INDENT
+ KUNIT_SUBTEST_INDENT "1..%zd\n",
+ test.params_array.num_params);
+ }
- while (test.param_value) {
- kunit_run_case_catch_errors(suite, test_case, &test);
+ while (curr_param) {
+ struct kunit param_test = {
+ .param_value = curr_param,
+ .param_index = ++test.param_index,
+ .parent = &test,
+ };
+ kunit_init_test(&param_test, test_case->name, NULL);
+ param_test.log = test_case->log;
+ kunit_run_case_catch_errors(suite, test_case, &param_test);
if (param_desc[0] == '\0') {
snprintf(param_desc, sizeof(param_desc),
- "param-%d", test.param_index);
+ "param-%d", param_test.param_index);
}
- kunit_print_ok_not_ok(&test, KUNIT_LEVEL_CASE_PARAM,
- test.status,
- test.param_index + 1,
+ kunit_print_ok_not_ok(&param_test, KUNIT_LEVEL_CASE_PARAM,
+ param_test.status,
+ param_test.param_index,
param_desc,
- test.status_comment);
+ param_test.status_comment);
- kunit_update_stats(&param_stats, test.status);
+ kunit_update_stats(&param_stats, param_test.status);
/* Get next param. */
param_desc[0] = '\0';
- test.param_value = test_case->generate_params(test.param_value, param_desc);
- test.param_index++;
- test.status = KUNIT_SUCCESS;
- test.status_comment[0] = '\0';
- test.priv = NULL;
+ curr_param = test_case->generate_params(&test, curr_param,
+ param_desc);
}
+ /*
+ * TODO: Put into a try catch. Since we don't need suite->exit
+ * for it we can't reuse kunit_try_run_cleanup for this yet.
+ */
+ if (test_case->param_exit)
+ test_case->param_exit(&test);
+ /* TODO: Put this kunit_cleanup into a try-catch. */
+ kunit_cleanup(&test);
}
-
+test_case_end:
kunit_print_attr((void *)test_case, true, KUNIT_LEVEL_CASE);
kunit_print_test_stats(&test, param_stats);
@@ -759,7 +867,6 @@ void __kunit_test_suites_exit(struct kunit_suite **suites, int num_suites)
}
EXPORT_SYMBOL_GPL(__kunit_test_suites_exit);
-#ifdef CONFIG_MODULES
static void kunit_module_init(struct module *mod)
{
struct kunit_suite_set suite_set, filtered_set;
@@ -847,7 +954,6 @@ static struct notifier_block kunit_mod_nb = {
.notifier_call = kunit_module_notify,
.priority = 0,
};
-#endif
KUNIT_DEFINE_ACTION_WRAPPER(kfree_action_wrapper, kfree, const void *)
@@ -938,20 +1044,14 @@ static int __init kunit_init(void)
kunit_debugfs_init();
kunit_bus_init();
-#ifdef CONFIG_MODULES
return register_module_notifier(&kunit_mod_nb);
-#else
- return 0;
-#endif
}
late_initcall(kunit_init);
static void __exit kunit_exit(void)
{
memset(&kunit_hooks, 0, sizeof(kunit_hooks));
-#ifdef CONFIG_MODULES
unregister_module_notifier(&kunit_mod_nb);
-#endif
kunit_bus_shutdown();
diff --git a/lib/kunit/try-catch-impl.h b/lib/kunit/try-catch-impl.h
index 203ba6a5e740..6f401b97cd0b 100644
--- a/lib/kunit/try-catch-impl.h
+++ b/lib/kunit/try-catch-impl.h
@@ -17,11 +17,13 @@ struct kunit;
static inline void kunit_try_catch_init(struct kunit_try_catch *try_catch,
struct kunit *test,
kunit_try_catch_func_t try,
- kunit_try_catch_func_t catch)
+ kunit_try_catch_func_t catch,
+ unsigned long timeout)
{
try_catch->test = test;
try_catch->try = try;
try_catch->catch = catch;
+ try_catch->timeout = timeout;
}
#endif /* _KUNIT_TRY_CATCH_IMPL_H */
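
With the timeout now a caller-supplied parameter, a usage sketch (try_fn and
catch_fn are hypothetical; the constant mirrors the old hard-coded five-minute
value):

	struct kunit_try_catch tc;

	kunit_try_catch_init(&tc, test, try_fn, catch_fn,
			     300 * msecs_to_jiffies(MSEC_PER_SEC));
	kunit_try_catch_run(&tc, context);
	if (tc.try_result == -ETIMEDOUT)
		kunit_err(test, "try function timed out\n");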
diff --git a/lib/kunit/try-catch.c b/lib/kunit/try-catch.c
index 6bbe0025b079..d84a879f0a78 100644
--- a/lib/kunit/try-catch.c
+++ b/lib/kunit/try-catch.c
@@ -34,31 +34,6 @@ static int kunit_generic_run_threadfn_adapter(void *data)
return 0;
}
-static unsigned long kunit_test_timeout(void)
-{
- /*
- * TODO(brendanhiggins@google.com): We should probably have some type of
- * variable timeout here. The only question is what that timeout value
- * should be.
- *
- * The intention has always been, at some point, to be able to label
- * tests with some type of size bucket (unit/small, integration/medium,
- * large/system/end-to-end, etc), where each size bucket would get a
- * default timeout value kind of like what Bazel does:
- * https://docs.bazel.build/versions/master/be/common-definitions.html#test.size
- * There is still some debate to be had on exactly how we do this. (For
- * one, we probably want to have some sort of test runner level
- * timeout.)
- *
- * For more background on this topic, see:
- * https://mike-bland.com/2011/11/01/small-medium-large.html
- *
- * If tests timeout due to exceeding sysctl_hung_task_timeout_secs,
- * the task will be killed and an oops generated.
- */
- return 300 * msecs_to_jiffies(MSEC_PER_SEC); /* 5 min */
-}
-
void kunit_try_catch_run(struct kunit_try_catch *try_catch, void *context)
{
struct kunit *test = try_catch->test;
@@ -85,8 +60,8 @@ void kunit_try_catch_run(struct kunit_try_catch *try_catch, void *context)
task_done = task_struct->vfork_done;
wake_up_process(task_struct);
- time_remaining = wait_for_completion_timeout(task_done,
- kunit_test_timeout());
+ time_remaining = wait_for_completion_timeout(
+ task_done, try_catch->timeout);
if (time_remaining == 0) {
try_catch->try_result = -ETIMEDOUT;
kthread_stop(task_struct);
diff --git a/lib/kunit/user_alloc.c b/lib/kunit/user_alloc.c
index 46951be018be..b8cac765e620 100644
--- a/lib/kunit/user_alloc.c
+++ b/lib/kunit/user_alloc.c
@@ -22,8 +22,7 @@ struct kunit_vm_mmap_params {
unsigned long offset;
};
-/* Create and attach a new mm if it doesn't already exist. */
-static int kunit_attach_mm(void)
+int kunit_attach_mm(void)
{
struct mm_struct *mm;
@@ -49,6 +48,7 @@ static int kunit_attach_mm(void)
return 0;
}
+EXPORT_SYMBOL_GPL(kunit_attach_mm);
static int kunit_vm_mmap_init(struct kunit_resource *res, void *context)
{
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index ed99344317f5..d939403331b5 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -202,7 +202,7 @@ static void init_shared_classes(void)
local_irq_disable(); \
__irq_enter(); \
lockdep_hardirq_threaded(); \
- WARN_ON(!in_irq());
+ WARN_ON(!in_hardirq());
#define HARDIRQ_EXIT() \
__irq_exit(); \
@@ -2512,7 +2512,7 @@ DEFINE_LOCK_GUARD_0(NOTTHREADED_HARDIRQ,
do {
local_irq_disable();
__irq_enter();
- WARN_ON(!in_irq());
+ WARN_ON(!in_hardirq());
} while(0), HARDIRQ_EXIT())
DEFINE_LOCK_GUARD_0(SOFTIRQ, SOFTIRQ_ENTER(), SOFTIRQ_EXIT())
diff --git a/lib/lzo/lzo1x_compress.c b/lib/lzo/lzo1x_compress.c
index 7b10ca86a893..01586af2347f 100644
--- a/lib/lzo/lzo1x_compress.c
+++ b/lib/lzo/lzo1x_compress.c
@@ -26,7 +26,7 @@
#define HAVE_OP(x) 1
#endif
-#define NEED_OP(x) if (!HAVE_OP(x)) goto output_overrun
+#define NEED_OP(x) if (unlikely(!HAVE_OP(x))) goto output_overrun
static noinline int
LZO_SAFE(lzo1x_1_do_compress)(const unsigned char *in, size_t in_len,
diff --git a/lib/lzo/lzo1x_decompress_safe.c b/lib/lzo/lzo1x_decompress_safe.c
index c94f4928e188..318abb82c63d 100644
--- a/lib/lzo/lzo1x_decompress_safe.c
+++ b/lib/lzo/lzo1x_decompress_safe.c
@@ -22,9 +22,9 @@
#define HAVE_IP(x) ((size_t)(ip_end - ip) >= (size_t)(x))
#define HAVE_OP(x) ((size_t)(op_end - op) >= (size_t)(x))
-#define NEED_IP(x) if (!HAVE_IP(x)) goto input_overrun
-#define NEED_OP(x) if (!HAVE_OP(x)) goto output_overrun
-#define TEST_LB(m_pos) if ((m_pos) < out) goto lookbehind_overrun
+#define NEED_IP(x) if (unlikely(!HAVE_IP(x))) goto input_overrun
+#define NEED_OP(x) if (unlikely(!HAVE_OP(x))) goto output_overrun
+#define TEST_LB(m_pos) if (unlikely((m_pos) < out)) goto lookbehind_overrun
/* This MAX_255_COUNT is the maximum number of times we can add 255 to a base
* count without overflowing an integer. The multiply will overflow when
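Wrapping the overrun checks in unlikely() moves the error paths off the fall-through path the compiler lays out for the hot copy loops. The same hint pattern, reduced to a self-contained sketch (illustrative helper, not part of the LZO code):

    /* Sketch of the bounds-check-with-hint pattern behind NEED_IP/NEED_OP:
     * the overrun branch is hinted cold, so the common case stays a
     * straight-line fall-through.
     */
    static inline int example_copy_byte(unsigned char **op,
    				    const unsigned char *op_end,
    				    unsigned char c)
    {
    	if (unlikely(*op >= op_end))
    		return -1;	/* output overrun */
    	*(*op)++ = c;
    	return 0;
    }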
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index affe979bd14d..5aa4c9500018 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -64,6 +64,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/maple_tree.h>
+#define TP_FCT tracepoint_string(__func__)
+
/*
* Kernel pointer hashing renders much of the maple tree dump useless as tagged
* pointers get hashed to arbitrary values.
@@ -83,13 +85,9 @@
/*
* Maple state flags
- * * MA_STATE_BULK - Bulk insert mode
- * * MA_STATE_REBALANCE - Indicate a rebalance during bulk insert
* * MA_STATE_PREALLOC - Preallocated nodes, WARN_ON allocation
*/
-#define MA_STATE_BULK 1
-#define MA_STATE_REBALANCE 2
-#define MA_STATE_PREALLOC 4
+#define MA_STATE_PREALLOC 1
#define ma_parent_ptr(x) ((struct maple_pnode *)(x))
#define mas_tree_parent(x) ((unsigned long)(x->tree) | MA_ROOT_PARENT)
@@ -176,26 +174,25 @@ static inline struct maple_node *mt_alloc_one(gfp_t gfp)
return kmem_cache_alloc(maple_node_cache, gfp);
}
-static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes)
+static inline void mt_free_bulk(size_t size, void __rcu **nodes)
{
- return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes);
+ kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes);
}
-static inline void mt_free_one(struct maple_node *node)
+static void mt_return_sheaf(struct slab_sheaf *sheaf)
{
- kmem_cache_free(maple_node_cache, node);
+ kmem_cache_return_sheaf(maple_node_cache, GFP_NOWAIT, sheaf);
}
-static inline void mt_free_bulk(size_t size, void __rcu **nodes)
+static struct slab_sheaf *mt_get_sheaf(gfp_t gfp, int count)
{
- kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes);
+ return kmem_cache_prefill_sheaf(maple_node_cache, gfp, count);
}
-static void mt_free_rcu(struct rcu_head *head)
+static int mt_refill_sheaf(gfp_t gfp, struct slab_sheaf **sheaf,
+ unsigned int size)
{
- struct maple_node *node = container_of(head, struct maple_node, rcu);
-
- kmem_cache_free(maple_node_cache, node);
+ return kmem_cache_refill_sheaf(maple_node_cache, gfp, sheaf, size);
}
/*
@@ -208,7 +205,7 @@ static void mt_free_rcu(struct rcu_head *head)
static void ma_free_rcu(struct maple_node *node)
{
WARN_ON(node->parent != ma_parent_ptr(node));
- call_rcu(&node->rcu, mt_free_rcu);
+ kfree_rcu(node, rcu);
}
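A call_rcu() callback whose only job is container_of() plus kfree() is exactly the pattern kfree_rcu() wraps; its second argument names the struct's rcu_head member. A minimal sketch:

    /* Sketch: free an object after a grace period without writing the
     * trivial call_rcu() callback by hand.
     */
    struct example_node {
    	struct rcu_head rcu;
    	unsigned long data;
    };

    static void example_free(struct example_node *n)
    {
    	kfree_rcu(n, rcu);	/* kfree(n) after all readers drop out */
    }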
static void mt_set_height(struct maple_tree *mt, unsigned char height)
@@ -405,11 +402,11 @@ static __always_inline bool mt_is_alloc(struct maple_tree *mt)
* a reuse of the last bit in the node type. This is possible by using bit 1 to
* indicate if bit 2 is part of the type or the slot.
*
- * Note types:
- * 0x??1 = Root
- * 0x?00 = 16 bit nodes
- * 0x010 = 32 bit nodes
- * 0x110 = 64 bit nodes
+ * Node types:
+ * 0b??1 = Root
+ * 0b?00 = 16 bit nodes
+ * 0b010 = 32 bit nodes
+ * 0b110 = 64 bit nodes
*
* Slot size and alignment
* 0b??1 : Root
@@ -427,7 +424,7 @@ static __always_inline bool mt_is_alloc(struct maple_tree *mt)
#define MAPLE_PARENT_16B_SLOT_MASK 0xFC
#define MAPLE_PARENT_RANGE64 0x06
-#define MAPLE_PARENT_RANGE32 0x04
+#define MAPLE_PARENT_RANGE32 0x02
#define MAPLE_PARENT_NOT_RANGE16 0x02
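The table above packs the node kind into the low bits of a tagged pointer. Read back as plain bit tests, the encoding looks roughly like this (illustrative helper, not a maple tree API):

    /* Sketch: decoding the low three bits per the table above.
     * 0b??1 root, 0b?00 16-bit, 0b010 32-bit, 0b110 64-bit.
     */
    static inline const char *example_parent_kind(unsigned long p)
    {
    	if (p & 0x1)
    		return "root";
    	if ((p & 0x3) == 0x0)
    		return "16 bit";
    	if ((p & 0x7) == 0x2)
    		return "32 bit";
    	if ((p & 0x7) == 0x6)
    		return "64 bit";
    	return "unreachable";
    }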
/*
@@ -591,67 +588,6 @@ static __always_inline bool mte_dead_node(const struct maple_enode *enode)
}
/*
- * mas_allocated() - Get the number of nodes allocated in a maple state.
- * @mas: The maple state
- *
- * The ma_state alloc member is overloaded to hold a pointer to the first
- * allocated node or to the number of requested nodes to allocate. If bit 0 is
- * set, then the alloc contains the number of requested nodes. If there is an
- * allocated node, then the total allocated nodes is in that node.
- *
- * Return: The total number of nodes allocated
- */
-static inline unsigned long mas_allocated(const struct ma_state *mas)
-{
- if (!mas->alloc || ((unsigned long)mas->alloc & 0x1))
- return 0;
-
- return mas->alloc->total;
-}
-
-/*
- * mas_set_alloc_req() - Set the requested number of allocations.
- * @mas: the maple state
- * @count: the number of allocations.
- *
- * The requested number of allocations is either in the first allocated node,
- * located in @mas->alloc->request_count, or directly in @mas->alloc if there is
- * no allocated node. Set the request either in the node or do the necessary
- * encoding to store in @mas->alloc directly.
- */
-static inline void mas_set_alloc_req(struct ma_state *mas, unsigned long count)
-{
- if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) {
- if (!count)
- mas->alloc = NULL;
- else
- mas->alloc = (struct maple_alloc *)(((count) << 1U) | 1U);
- return;
- }
-
- mas->alloc->request_count = count;
-}
-
-/*
- * mas_alloc_req() - get the requested number of allocations.
- * @mas: The maple state
- *
- * The alloc count is either stored directly in @mas, or in
- * @mas->alloc->request_count if there is at least one node allocated. Decode
- * the request count if it's stored directly in @mas->alloc.
- *
- * Return: The allocation request count.
- */
-static inline unsigned int mas_alloc_req(const struct ma_state *mas)
-{
- if ((unsigned long)mas->alloc & 0x1)
- return (unsigned long)(mas->alloc) >> 1;
- else if (mas->alloc)
- return mas->alloc->request_count;
- return 0;
-}
-
-/*
* ma_pivots() - Get a pointer to the maple node pivots.
* @node: the maple node
* @type: the node type
@@ -1032,28 +968,10 @@ static inline void mas_descend(struct ma_state *mas)
}
/*
- * mte_set_gap() - Set a maple node gap.
- * @mn: The encoded maple node
- * @gap: The offset of the gap to set
- * @val: The gap value
- */
-static inline void mte_set_gap(const struct maple_enode *mn,
- unsigned char gap, unsigned long val)
-{
- switch (mte_node_type(mn)) {
- default:
- break;
- case maple_arange_64:
- mte_to_node(mn)->ma64.gap[gap] = val;
- break;
- }
-}
-
-/*
* mas_ascend() - Walk up a level of the tree.
* @mas: The maple state
*
- * Sets the @mas->max and @mas->min to the correct values when walking up. This
+ * Sets the @mas->max and @mas->min for the parent node of mas->node. This
* may cause several levels of walking up to find the correct min and max.
* May find a dead node which will cause a premature return.
* Return: 1 on dead node, 0 otherwise
@@ -1098,6 +1016,12 @@ static int mas_ascend(struct ma_state *mas)
min = 0;
max = ULONG_MAX;
+
+ /*
+ * !mas->offset implies that parent node min == mas->min.
+ * mas->offset > 0 implies that we need to walk up to find the
+ * implied pivot min.
+ */
if (!mas->offset) {
min = mas->min;
set_min = true;
@@ -1146,79 +1070,24 @@ static int mas_ascend(struct ma_state *mas)
*
* Return: A pointer to a maple node.
*/
-static inline struct maple_node *mas_pop_node(struct ma_state *mas)
+static __always_inline struct maple_node *mas_pop_node(struct ma_state *mas)
{
- struct maple_alloc *ret, *node = mas->alloc;
- unsigned long total = mas_allocated(mas);
- unsigned int req = mas_alloc_req(mas);
-
- /* nothing or a request pending. */
- if (WARN_ON(!total))
- return NULL;
+ struct maple_node *ret;
- if (total == 1) {
- /* single allocation in this ma_state */
+ if (mas->alloc) {
+ ret = mas->alloc;
mas->alloc = NULL;
- ret = node;
- goto single_node;
+ goto out;
}
- if (node->node_count == 1) {
- /* Single allocation in this node. */
- mas->alloc = node->slot[0];
- mas->alloc->total = node->total - 1;
- ret = node;
- goto new_head;
- }
- node->total--;
- ret = node->slot[--node->node_count];
- node->slot[node->node_count] = NULL;
+ if (WARN_ON_ONCE(!mas->sheaf))
+ return NULL;
-single_node:
-new_head:
- if (req) {
- req++;
- mas_set_alloc_req(mas, req);
- }
+ ret = kmem_cache_alloc_from_sheaf(maple_node_cache, GFP_NOWAIT, mas->sheaf);
+out:
memset(ret, 0, sizeof(*ret));
- return (struct maple_node *)ret;
-}
-
-/*
- * mas_push_node() - Push a node back on the maple state allocation.
- * @mas: The maple state
- * @used: The used maple node
- *
- * Stores the maple node back into @mas->alloc for reuse. Updates allocated and
- * requested node count as necessary.
- */
-static inline void mas_push_node(struct ma_state *mas, struct maple_node *used)
-{
- struct maple_alloc *reuse = (struct maple_alloc *)used;
- struct maple_alloc *head = mas->alloc;
- unsigned long count;
- unsigned int requested = mas_alloc_req(mas);
-
- count = mas_allocated(mas);
-
- reuse->request_count = 0;
- reuse->node_count = 0;
- if (count) {
- if (head->node_count < MAPLE_ALLOC_SLOTS) {
- head->slot[head->node_count++] = reuse;
- head->total++;
- goto done;
- }
- reuse->slot[0] = head;
- reuse->node_count = 1;
- }
-
- reuse->total = count + 1;
- mas->alloc = reuse;
-done:
- if (requested > 1)
- mas_set_alloc_req(mas, requested - 1);
+ return ret;
}
/*
@@ -1228,121 +1097,81 @@ done:
*/
static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)
{
- struct maple_alloc *node;
- unsigned long allocated = mas_allocated(mas);
- unsigned int requested = mas_alloc_req(mas);
- unsigned int count;
- void **slots = NULL;
- unsigned int max_req = 0;
-
- if (!requested)
+ if (!mas->node_request)
return;
- mas_set_alloc_req(mas, 0);
- if (mas->mas_flags & MA_STATE_PREALLOC) {
- if (allocated)
+ if (mas->node_request == 1) {
+ if (mas->sheaf)
+ goto use_sheaf;
+
+ if (mas->alloc)
return;
- WARN_ON(!allocated);
- }
- if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) {
- node = (struct maple_alloc *)mt_alloc_one(gfp);
- if (!node)
- goto nomem_one;
+ mas->alloc = mt_alloc_one(gfp);
+ if (!mas->alloc)
+ goto error;
- if (allocated) {
- node->slot[0] = mas->alloc;
- node->node_count = 1;
- } else {
- node->node_count = 0;
- }
+ mas->node_request = 0;
+ return;
+ }
- mas->alloc = node;
- node->total = ++allocated;
- node->request_count = 0;
- requested--;
+use_sheaf:
+ if (unlikely(mas->alloc)) {
+ kfree(mas->alloc);
+ mas->alloc = NULL;
}
- node = mas->alloc;
- while (requested) {
- max_req = MAPLE_ALLOC_SLOTS - node->node_count;
- slots = (void **)&node->slot[node->node_count];
- max_req = min(requested, max_req);
- count = mt_alloc_bulk(gfp, max_req, slots);
- if (!count)
- goto nomem_bulk;
+ if (mas->sheaf) {
+ unsigned long refill;
- if (node->node_count == 0) {
- node->slot[0]->node_count = 0;
- node->slot[0]->request_count = 0;
+ refill = mas->node_request;
+ if (kmem_cache_sheaf_size(mas->sheaf) >= refill) {
+ mas->node_request = 0;
+ return;
}
- node->node_count += count;
- allocated += count;
- /* find a non-full node*/
- do {
- node = node->slot[0];
- } while (unlikely(node->node_count == MAPLE_ALLOC_SLOTS));
- requested -= count;
- }
- mas->alloc->total = allocated;
- return;
+ if (mt_refill_sheaf(gfp, &mas->sheaf, refill))
+ goto error;
-nomem_bulk:
- /* Clean up potential freed allocations on bulk failure */
- memset(slots, 0, max_req * sizeof(unsigned long));
- mas->alloc->total = allocated;
-nomem_one:
- mas_set_alloc_req(mas, requested);
- mas_set_err(mas, -ENOMEM);
-}
+ mas->node_request = 0;
+ return;
+ }
-/*
- * mas_free() - Free an encoded maple node
- * @mas: The maple state
- * @used: The encoded maple node to free.
- *
- * Uses rcu free if necessary, pushes @used back on the maple state allocations
- * otherwise.
- */
-static inline void mas_free(struct ma_state *mas, struct maple_enode *used)
-{
- struct maple_node *tmp = mte_to_node(used);
+ mas->sheaf = mt_get_sheaf(gfp, mas->node_request);
+ if (likely(mas->sheaf)) {
+ mas->node_request = 0;
+ return;
+ }
- if (mt_in_rcu(mas->tree))
- ma_free_rcu(tmp);
- else
- mas_push_node(mas, tmp);
+error:
+ mas_set_err(mas, -ENOMEM);
}
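Node reserves now come from slab sheaves rather than the hand-rolled bulk-allocation list threaded through maple_alloc nodes: a sheaf is prefilled once while sleeping allocations are allowed, then drained with GFP_NOWAIT under the tree lock. A sketch of the lifecycle, using only the sheaf calls this patch introduces:

    /* Sketch: prefill, consume without sleeping, return the remainder.
     * Mirrors mas_alloc_nodes()/mas_pop_node()/mas_empty_nodes().
     */
    static int example_sheaf_cycle(struct kmem_cache *cache, unsigned int n)
    {
    	struct slab_sheaf *sheaf;
    	void *obj;

    	sheaf = kmem_cache_prefill_sheaf(cache, GFP_KERNEL, n);
    	if (!sheaf)
    		return -ENOMEM;

    	/* Fast path: the objects are already reserved. */
    	obj = kmem_cache_alloc_from_sheaf(cache, GFP_NOWAIT, sheaf);
    	kmem_cache_free(cache, obj);

    	/* Hand any unused capacity back. */
    	kmem_cache_return_sheaf(cache, GFP_NOWAIT, sheaf);
    	return 0;
    }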
-/*
- * mas_node_count_gfp() - Check if enough nodes are allocated and request more
- * if there is not enough nodes.
- * @mas: The maple state
- * @count: The number of nodes needed
- * @gfp: the gfp flags
- */
-static void mas_node_count_gfp(struct ma_state *mas, int count, gfp_t gfp)
+static inline void mas_empty_nodes(struct ma_state *mas)
{
- unsigned long allocated = mas_allocated(mas);
+ mas->node_request = 0;
+ if (mas->sheaf) {
+ mt_return_sheaf(mas->sheaf);
+ mas->sheaf = NULL;
+ }
- if (allocated < count) {
- mas_set_alloc_req(mas, count - allocated);
- mas_alloc_nodes(mas, gfp);
+ if (mas->alloc) {
+ kfree(mas->alloc);
+ mas->alloc = NULL;
}
}
/*
- * mas_node_count() - Check if enough nodes are allocated and request more if
- * there is not enough nodes.
+ * mas_free() - Free an encoded maple node
* @mas: The maple state
- * @count: The number of nodes needed
+ * @used: The encoded maple node to free.
*
- * Note: Uses GFP_NOWAIT | __GFP_NOWARN for gfp flags.
+ * Uses rcu free if necessary, pushes @used back on the maple state allocations
+ * otherwise.
*/
-static void mas_node_count(struct ma_state *mas, int count)
+static inline void mas_free(struct ma_state *mas, struct maple_enode *used)
{
- return mas_node_count_gfp(mas, count, GFP_NOWAIT | __GFP_NOWARN);
+ ma_free_rcu(mte_to_node(used));
}
/*
@@ -1872,21 +1701,7 @@ static inline int mab_calc_split(struct ma_state *mas,
* end on a NULL entry, with the exception of the left-most leaf. The
* limitation means that the split of a node must be checked for this condition
* and be able to put more data in one direction or the other.
- */
- if (unlikely((mas->mas_flags & MA_STATE_BULK))) {
- *mid_split = 0;
- split = b_end - mt_min_slots[bn->type];
-
- if (!ma_is_leaf(bn->type))
- return split;
-
- mas->mas_flags |= MA_STATE_REBALANCE;
- if (!bn->slot[split])
- split--;
- return split;
- }
-
- /*
+ *
* Although extremely rare, it is possible to enter what is known as the 3-way
* split scenario. The 3-way split comes about by means of a store of a range
* that overwrites the end and beginning of two full nodes. The result is a set
@@ -2034,27 +1849,6 @@ static inline void mab_mas_cp(struct maple_big_node *b_node,
}
/*
- * mas_bulk_rebalance() - Rebalance the end of a tree after a bulk insert.
- * @mas: The maple state
- * @end: The maple node end
- * @mt: The maple node type
- */
-static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end,
- enum maple_type mt)
-{
- if (!(mas->mas_flags & MA_STATE_BULK))
- return;
-
- if (mte_is_root(mas->node))
- return;
-
- if (end > mt_min_slots[mt]) {
- mas->mas_flags &= ~MA_STATE_REBALANCE;
- return;
- }
-}
-
-/*
* mas_store_b_node() - Store an @entry into the b_node while also copying the
* data from a maple encoded node.
* @wr_mas: the maple write state
@@ -2103,9 +1897,6 @@ static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas,
/* Handle new range ending before old range ends */
piv = mas_safe_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type);
if (piv > mas->last) {
- if (piv == ULONG_MAX)
- mas_bulk_rebalance(mas, b_node->b_end, wr_mas->type);
-
if (offset_end != slot)
wr_mas->content = mas_slot_locked(mas, wr_mas->slots,
offset_end);
@@ -2517,10 +2308,7 @@ static inline void mas_topiary_node(struct ma_state *mas,
enode = tmp_mas->node;
tmp = mte_to_node(enode);
mte_set_node_dead(enode);
- if (in_rcu)
- ma_free_rcu(tmp);
- else
- mas_push_node(mas, tmp);
+ ma_free_rcu(tmp);
}
/*
@@ -2970,7 +2758,7 @@ static inline void mas_rebalance(struct ma_state *mas,
MA_STATE(l_mas, mas->tree, mas->index, mas->last);
MA_STATE(r_mas, mas->tree, mas->index, mas->last);
- trace_ma_op(__func__, mas);
+ trace_ma_op(TP_FCT, mas);
/*
* Rebalancing occurs if a node is insufficient. Data is rebalanced
@@ -3006,126 +2794,6 @@ static inline void mas_rebalance(struct ma_state *mas,
}
/*
- * mas_destroy_rebalance() - Rebalance left-most node while destroying the maple
- * state.
- * @mas: The maple state
- * @end: The end of the left-most node.
- *
- * During a mass-insert event (such as forking), it may be necessary to
- * rebalance the left-most node when it is not sufficient.
- */
-static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end)
-{
- enum maple_type mt = mte_node_type(mas->node);
- struct maple_node reuse, *newnode, *parent, *new_left, *left, *node;
- struct maple_enode *eparent, *old_eparent;
- unsigned char offset, tmp, split = mt_slots[mt] / 2;
- void __rcu **l_slots, **slots;
- unsigned long *l_pivs, *pivs, gap;
- bool in_rcu = mt_in_rcu(mas->tree);
- unsigned char new_height = mas_mt_height(mas);
-
- MA_STATE(l_mas, mas->tree, mas->index, mas->last);
-
- l_mas = *mas;
- mas_prev_sibling(&l_mas);
-
- /* set up node. */
- if (in_rcu) {
- newnode = mas_pop_node(mas);
- } else {
- newnode = &reuse;
- }
-
- node = mas_mn(mas);
- newnode->parent = node->parent;
- slots = ma_slots(newnode, mt);
- pivs = ma_pivots(newnode, mt);
- left = mas_mn(&l_mas);
- l_slots = ma_slots(left, mt);
- l_pivs = ma_pivots(left, mt);
- if (!l_slots[split])
- split++;
- tmp = mas_data_end(&l_mas) - split;
-
- memcpy(slots, l_slots + split + 1, sizeof(void *) * tmp);
- memcpy(pivs, l_pivs + split + 1, sizeof(unsigned long) * tmp);
- pivs[tmp] = l_mas.max;
- memcpy(slots + tmp, ma_slots(node, mt), sizeof(void *) * end);
- memcpy(pivs + tmp, ma_pivots(node, mt), sizeof(unsigned long) * end);
-
- l_mas.max = l_pivs[split];
- mas->min = l_mas.max + 1;
- old_eparent = mt_mk_node(mte_parent(l_mas.node),
- mas_parent_type(&l_mas, l_mas.node));
- tmp += end;
- if (!in_rcu) {
- unsigned char max_p = mt_pivots[mt];
- unsigned char max_s = mt_slots[mt];
-
- if (tmp < max_p)
- memset(pivs + tmp, 0,
- sizeof(unsigned long) * (max_p - tmp));
-
- if (tmp < mt_slots[mt])
- memset(slots + tmp, 0, sizeof(void *) * (max_s - tmp));
-
- memcpy(node, newnode, sizeof(struct maple_node));
- ma_set_meta(node, mt, 0, tmp - 1);
- mte_set_pivot(old_eparent, mte_parent_slot(l_mas.node),
- l_pivs[split]);
-
- /* Remove data from l_pivs. */
- tmp = split + 1;
- memset(l_pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp));
- memset(l_slots + tmp, 0, sizeof(void *) * (max_s - tmp));
- ma_set_meta(left, mt, 0, split);
- eparent = old_eparent;
-
- goto done;
- }
-
- /* RCU requires replacing both l_mas, mas, and parent. */
- mas->node = mt_mk_node(newnode, mt);
- ma_set_meta(newnode, mt, 0, tmp);
-
- new_left = mas_pop_node(mas);
- new_left->parent = left->parent;
- mt = mte_node_type(l_mas.node);
- slots = ma_slots(new_left, mt);
- pivs = ma_pivots(new_left, mt);
- memcpy(slots, l_slots, sizeof(void *) * split);
- memcpy(pivs, l_pivs, sizeof(unsigned long) * split);
- ma_set_meta(new_left, mt, 0, split);
- l_mas.node = mt_mk_node(new_left, mt);
-
- /* replace parent. */
- offset = mte_parent_slot(mas->node);
- mt = mas_parent_type(&l_mas, l_mas.node);
- parent = mas_pop_node(mas);
- slots = ma_slots(parent, mt);
- pivs = ma_pivots(parent, mt);
- memcpy(parent, mte_to_node(old_eparent), sizeof(struct maple_node));
- rcu_assign_pointer(slots[offset], mas->node);
- rcu_assign_pointer(slots[offset - 1], l_mas.node);
- pivs[offset - 1] = l_mas.max;
- eparent = mt_mk_node(parent, mt);
-done:
- gap = mas_leaf_max_gap(mas);
- mte_set_gap(eparent, mte_parent_slot(mas->node), gap);
- gap = mas_leaf_max_gap(&l_mas);
- mte_set_gap(eparent, mte_parent_slot(l_mas.node), gap);
- mas_ascend(mas);
-
- if (in_rcu) {
- mas_replace_node(mas, old_eparent, new_height);
- mas_adopt_children(mas, mas->node);
- }
-
- mas_update_gap(mas);
-}
-
-/*
* mas_split_final_node() - Split the final node in a subtree operation.
* @mast: the maple subtree state
* @mas: The maple state
@@ -3331,7 +2999,7 @@ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node)
MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last);
MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last);
- trace_ma_op(__func__, mas);
+ trace_ma_op(TP_FCT, mas);
mast.l = &l_mas;
mast.r = &r_mas;
@@ -3506,7 +3174,7 @@ static bool mas_is_span_wr(struct ma_wr_state *wr_mas)
return false;
}
- trace_ma_write(__func__, wr_mas->mas, wr_mas->r_max, entry);
+ trace_ma_write(TP_FCT, wr_mas->mas, wr_mas->r_max, entry);
return true;
}
@@ -3750,7 +3418,7 @@ static noinline void mas_wr_spanning_store(struct ma_wr_state *wr_mas)
* of data may happen.
*/
mas = wr_mas->mas;
- trace_ma_op(__func__, mas);
+ trace_ma_op(TP_FCT, mas);
if (unlikely(!mas->index && mas->last == ULONG_MAX))
return mas_new_root(mas, wr_mas->entry);
@@ -3831,8 +3499,6 @@ static inline void mas_wr_node_store(struct ma_wr_state *wr_mas,
if (mas->last == wr_mas->end_piv)
offset_end++; /* don't copy this offset */
- else if (unlikely(wr_mas->r_max == ULONG_MAX))
- mas_bulk_rebalance(mas, mas->end, wr_mas->type);
/* set up node. */
if (in_rcu) {
@@ -3888,7 +3554,7 @@ done:
} else {
memcpy(wr_mas->node, newnode, sizeof(struct maple_node));
}
- trace_ma_write(__func__, mas, 0, wr_mas->entry);
+ trace_ma_write(TP_FCT, mas, 0, wr_mas->entry);
mas_update_gap(mas);
mas->end = new_end;
return;
@@ -3932,7 +3598,7 @@ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas)
mas->offset++; /* Keep mas accurate. */
}
- trace_ma_write(__func__, mas, 0, wr_mas->entry);
+ trace_ma_write(TP_FCT, mas, 0, wr_mas->entry);
/*
* Only update gap when the new entry is empty or there is an empty
* entry in the original two ranges.
@@ -4053,7 +3719,7 @@ static inline void mas_wr_append(struct ma_wr_state *wr_mas,
mas_update_gap(mas);
mas->end = new_end;
- trace_ma_write(__func__, mas, new_end, wr_mas->entry);
+ trace_ma_write(TP_FCT, mas, new_end, wr_mas->entry);
return;
}
@@ -4067,7 +3733,7 @@ static void mas_wr_bnode(struct ma_wr_state *wr_mas)
{
struct maple_big_node b_node;
- trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry);
+ trace_ma_write(TP_FCT, wr_mas->mas, 0, wr_mas->entry);
memset(&b_node, 0, sizeof(struct maple_big_node));
mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end);
mas_commit_b_node(wr_mas, &b_node);
@@ -4168,7 +3834,7 @@ set_content:
*
- * Return: Number of nodes required for preallocation.
+ * The required node count is stored in @wr_mas->mas->node_request.
*/
-static inline int mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry)
+static inline void mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry)
{
struct ma_state *mas = wr_mas->mas;
unsigned char height = mas_mt_height(mas);
@@ -4214,7 +3880,7 @@ static inline int mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry)
WARN_ON_ONCE(1);
}
- return ret;
+ mas->node_request = ret;
}
/*
@@ -4249,7 +3915,7 @@ static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas)
new_end = mas_wr_new_end(wr_mas);
/* Potential spanning rebalance collapsing a node */
if (new_end < mt_min_slots[wr_mas->type]) {
- if (!mte_is_root(mas->node) && !(mas->mas_flags & MA_STATE_BULK))
+ if (!mte_is_root(mas->node))
return wr_rebalance;
return wr_node_store;
}
@@ -4275,15 +3941,15 @@ static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas)
*/
static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry)
{
- int request;
+ struct ma_state *mas = wr_mas->mas;
mas_wr_prealloc_setup(wr_mas);
- wr_mas->mas->store_type = mas_wr_store_type(wr_mas);
- request = mas_prealloc_calc(wr_mas, entry);
- if (!request)
+ mas->store_type = mas_wr_store_type(wr_mas);
+ mas_prealloc_calc(wr_mas, entry);
+ if (!mas->node_request)
return;
- mas_node_count(wr_mas->mas, request);
+ mas_alloc_nodes(mas, GFP_NOWAIT);
}
/**
@@ -4560,15 +4226,12 @@ again:
if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
goto retry;
-
if (likely(entry))
return entry;
if (!empty) {
- if (mas->index <= min) {
- mas->status = ma_underflow;
- return NULL;
- }
+ if (mas->index <= min)
+ goto underflow;
goto again;
}
@@ -4930,7 +4593,7 @@ void *mas_walk(struct ma_state *mas)
{
void *entry;
- if (!mas_is_active(mas) || !mas_is_start(mas))
+ if (!mas_is_active(mas) && !mas_is_start(mas))
mas->status = ma_start;
retry:
entry = mas_state_walk(mas);
@@ -5278,7 +4941,7 @@ static void mt_free_walk(struct rcu_head *head)
mt_free_bulk(node->slot_len, slots);
free_leaf:
- mt_free_rcu(&node->rcu);
+ kfree(node);
}
static inline void __rcu **mte_destroy_descend(struct maple_enode **enode,
@@ -5319,6 +4982,7 @@ static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt,
struct maple_enode *start;
if (mte_is_leaf(enode)) {
+ mte_set_node_dead(enode);
node->type = mte_node_type(enode);
goto free_leaf;
}
@@ -5361,7 +5025,7 @@ next:
free_leaf:
if (free)
- mt_free_rcu(&node->rcu);
+ kfree(node);
else
mt_clear_meta(mt, node, node->type);
}
@@ -5398,10 +5062,9 @@ static inline void mte_destroy_walk(struct maple_enode *enode,
*/
void *mas_store(struct ma_state *mas, void *entry)
{
- int request;
MA_WR_STATE(wr_mas, mas, entry);
- trace_ma_write(__func__, mas, 0, entry);
+ trace_ma_write(TP_FCT, mas, 0, entry);
#ifdef CONFIG_DEBUG_MAPLE_TREE
if (MAS_WARN_ON(mas, mas->index > mas->last))
pr_err("Error %lX > %lX " PTR_FMT "\n", mas->index, mas->last,
@@ -5428,11 +5091,11 @@ void *mas_store(struct ma_state *mas, void *entry)
return wr_mas.content;
}
- request = mas_prealloc_calc(&wr_mas, entry);
- if (!request)
+ mas_prealloc_calc(&wr_mas, entry);
+ if (!mas->node_request)
goto store;
- mas_node_count(mas, request);
+ mas_alloc_nodes(mas, GFP_NOWAIT);
if (mas_is_err(mas))
return NULL;
@@ -5502,7 +5165,7 @@ void mas_store_prealloc(struct ma_state *mas, void *entry)
}
store:
- trace_ma_write(__func__, mas, 0, entry);
+ trace_ma_write(TP_FCT, mas, 0, entry);
mas_wr_store_entry(&wr_mas);
MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas));
mas_destroy(mas);
@@ -5520,26 +5183,27 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc);
int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp)
{
MA_WR_STATE(wr_mas, mas, entry);
- int ret = 0;
- int request;
mas_wr_prealloc_setup(&wr_mas);
mas->store_type = mas_wr_store_type(&wr_mas);
- request = mas_prealloc_calc(&wr_mas, entry);
- if (!request)
- return ret;
+ mas_prealloc_calc(&wr_mas, entry);
+ if (!mas->node_request)
+ goto set_flag;
- mas_node_count_gfp(mas, request, gfp);
+ mas->mas_flags &= ~MA_STATE_PREALLOC;
+ mas_alloc_nodes(mas, gfp);
if (mas_is_err(mas)) {
- mas_set_alloc_req(mas, 0);
- ret = xa_err(mas->node);
+ int ret = xa_err(mas->node);
+
+ mas->node_request = 0;
mas_destroy(mas);
mas_reset(mas);
return ret;
}
+set_flag:
mas->mas_flags |= MA_STATE_PREALLOC;
- return ret;
+ return 0;
}
EXPORT_SYMBOL_GPL(mas_preallocate);
@@ -5553,108 +5217,21 @@ EXPORT_SYMBOL_GPL(mas_preallocate);
*/
void mas_destroy(struct ma_state *mas)
{
- struct maple_alloc *node;
- unsigned long total;
-
- /*
- * When using mas_for_each() to insert an expected number of elements,
- * it is possible that the number inserted is less than the expected
- * number. To fix an invalid final node, a check is performed here to
- * rebalance the previous node with the final node.
- */
- if (mas->mas_flags & MA_STATE_REBALANCE) {
- unsigned char end;
- if (mas_is_err(mas))
- mas_reset(mas);
- mas_start(mas);
- mtree_range_walk(mas);
- end = mas->end + 1;
- if (end < mt_min_slot_count(mas->node) - 1)
- mas_destroy_rebalance(mas, end);
-
- mas->mas_flags &= ~MA_STATE_REBALANCE;
- }
- mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC);
-
- total = mas_allocated(mas);
- while (total) {
- node = mas->alloc;
- mas->alloc = node->slot[0];
- if (node->node_count > 1) {
- size_t count = node->node_count - 1;
-
- mt_free_bulk(count, (void __rcu **)&node->slot[1]);
- total -= count;
- }
- mt_free_one(ma_mnode_ptr(node));
- total--;
- }
-
- mas->alloc = NULL;
+ mas->mas_flags &= ~MA_STATE_PREALLOC;
+ mas_empty_nodes(mas);
}
EXPORT_SYMBOL_GPL(mas_destroy);
-/*
- * mas_expected_entries() - Set the expected number of entries that will be inserted.
- * @mas: The maple state
- * @nr_entries: The number of expected entries.
- *
- * This will attempt to pre-allocate enough nodes to store the expected number
- * of entries. The allocations will occur using the bulk allocator interface
- * for speed. Please call mas_destroy() on the @mas after inserting the entries
- * to ensure any unused nodes are freed.
- *
- * Return: 0 on success, -ENOMEM if memory could not be allocated.
- */
-int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries)
+static void mas_may_activate(struct ma_state *mas)
{
- int nonleaf_cap = MAPLE_ARANGE64_SLOTS - 2;
- struct maple_enode *enode = mas->node;
- int nr_nodes;
- int ret;
-
- /*
- * Sometimes it is necessary to duplicate a tree to a new tree, such as
- * forking a process and duplicating the VMAs from one tree to a new
- * tree. When such a situation arises, it is known that the new tree is
- * not going to be used until the entire tree is populated. For
- * performance reasons, it is best to use a bulk load with RCU disabled.
- * This allows for optimistic splitting that favours the left and reuse
- * of nodes during the operation.
- */
-
- /* Optimize splitting for bulk insert in-order */
- mas->mas_flags |= MA_STATE_BULK;
-
- /*
- * Avoid overflow, assume a gap between each entry and a trailing null.
- * If this is wrong, it just means allocation can happen during
- * insertion of entries.
- */
- nr_nodes = max(nr_entries, nr_entries * 2 + 1);
- if (!mt_is_alloc(mas->tree))
- nonleaf_cap = MAPLE_RANGE64_SLOTS - 2;
-
- /* Leaves; reduce slots to keep space for expansion */
- nr_nodes = DIV_ROUND_UP(nr_nodes, MAPLE_RANGE64_SLOTS - 2);
- /* Internal nodes */
- nr_nodes += DIV_ROUND_UP(nr_nodes, nonleaf_cap);
- /* Add working room for split (2 nodes) + new parents */
- mas_node_count_gfp(mas, nr_nodes + 3, GFP_KERNEL);
-
- /* Detect if allocations run out */
- mas->mas_flags |= MA_STATE_PREALLOC;
-
- if (!mas_is_err(mas))
- return 0;
-
- ret = xa_err(mas->node);
- mas->node = enode;
- mas_destroy(mas);
- return ret;
-
+ if (!mas->node) {
+ mas->status = ma_start;
+ } else if (mas->index > mas->max || mas->index < mas->min) {
+ mas->status = ma_start;
+ } else {
+ mas->status = ma_active;
+ }
}
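mas_may_activate() replaces the unconditional flips to ma_active below: before resuming from an overflow/underflow state, the cached node is checked against the current index, and the walk restarts from scratch when it no longer applies. In caller terms (tree here is a hypothetical struct maple_tree):

    /* Sketch: why the revalidation matters. Moving the index outside
     * the cached node's [min, max] span must not resume as ma_active.
     */
    MA_STATE(mas, &tree, 10, 10);

    (void)mas_walk(&mas);		/* caches the node covering 10 */
    mas.index = mas.last = 1000;	/* jump outside the cached span */
    /* the next mas_next()/mas_prev() restarts from ma_start */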
-EXPORT_SYMBOL_GPL(mas_expected_entries);
static bool mas_next_setup(struct ma_state *mas, unsigned long max,
void **entry)
@@ -5679,11 +5256,11 @@ static bool mas_next_setup(struct ma_state *mas, unsigned long max,
break;
case ma_overflow:
/* Overflowed before, but the max changed */
- mas->status = ma_active;
+ mas_may_activate(mas);
break;
case ma_underflow:
/* The user expects the mas to be one before where it is */
- mas->status = ma_active;
+ mas_may_activate(mas);
*entry = mas_walk(mas);
if (*entry)
return true;
@@ -5804,11 +5381,11 @@ static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry
break;
case ma_underflow:
/* underflowed before but the min changed */
- mas->status = ma_active;
+ mas_may_activate(mas);
break;
case ma_overflow:
/* User expects mas to be one after where it is */
- mas->status = ma_active;
+ mas_may_activate(mas);
*entry = mas_walk(mas);
if (*entry)
return true;
@@ -5973,7 +5550,7 @@ static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long m
return true;
}
- mas->status = ma_active;
+ mas_may_activate(mas);
*entry = mas_walk(mas);
if (*entry)
return true;
@@ -5982,7 +5559,7 @@ static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long m
if (unlikely(mas->last >= max))
return true;
- mas->status = ma_active;
+ mas_may_activate(mas);
*entry = mas_walk(mas);
if (*entry)
return true;
@@ -6276,7 +5853,7 @@ bool mas_nomem(struct ma_state *mas, gfp_t gfp)
mas_alloc_nodes(mas, gfp);
}
- if (!mas_allocated(mas))
+ if (!mas->sheaf && !mas->alloc)
return false;
mas->status = ma_start;
@@ -6285,9 +5862,14 @@ bool mas_nomem(struct ma_state *mas, gfp_t gfp)
void __init maple_tree_init(void)
{
+ struct kmem_cache_args args = {
+ .align = sizeof(struct maple_node),
+ .sheaf_capacity = 32,
+ };
+
maple_node_cache = kmem_cache_create("maple_node",
- sizeof(struct maple_node), sizeof(struct maple_node),
- SLAB_PANIC, NULL);
+ sizeof(struct maple_node), &args,
+ SLAB_PANIC);
}
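kmem_cache_create() is used here in its struct kmem_cache_args form, which bundles alignment and the new per-CPU sheaf capacity instead of passing align and a constructor positionally. The same call for a hypothetical cache (the capacity of 32 matches this patch; an appropriate value is workload-specific):

    /* Sketch: args-based cache creation with a percpu sheaf, as above. */
    struct kmem_cache_args example_args = {
    	.align		= sizeof(struct maple_node),
    	.sheaf_capacity	= 32,
    };
    struct kmem_cache *example_cache =
    	kmem_cache_create("example_node", sizeof(struct maple_node),
    			  &example_args, SLAB_PANIC);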
/**
@@ -6302,7 +5884,7 @@ void *mtree_load(struct maple_tree *mt, unsigned long index)
MA_STATE(mas, mt, index, index);
void *entry;
- trace_ma_read(__func__, &mas);
+ trace_ma_read(TP_FCT, &mas);
rcu_read_lock();
retry:
entry = mas_start(&mas);
@@ -6345,7 +5927,7 @@ int mtree_store_range(struct maple_tree *mt, unsigned long index,
MA_STATE(mas, mt, index, last);
int ret = 0;
- trace_ma_write(__func__, &mas, 0, entry);
+ trace_ma_write(TP_FCT, &mas, 0, entry);
if (WARN_ON_ONCE(xa_is_advanced(entry)))
return -EINVAL;
@@ -6568,7 +6150,7 @@ void *mtree_erase(struct maple_tree *mt, unsigned long index)
void *entry = NULL;
MA_STATE(mas, mt, index, index);
- trace_ma_op(__func__, &mas);
+ trace_ma_op(TP_FCT, &mas);
mtree_lock(mt);
entry = mas_erase(&mas);
@@ -6620,7 +6202,7 @@ static void mas_dup_free(struct ma_state *mas)
}
node = mte_to_node(mas->node);
- mt_free_one(node);
+ kfree(node);
}
/*
@@ -6661,7 +6243,7 @@ static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas,
struct maple_node *node = mte_to_node(mas->node);
struct maple_node *new_node = mte_to_node(new_mas->node);
enum maple_type type;
- unsigned char request, count, i;
+ unsigned char count, i;
void __rcu **slots;
void __rcu **new_slots;
unsigned long val;
@@ -6669,20 +6251,17 @@ static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas,
/* Allocate memory for child nodes. */
type = mte_node_type(mas->node);
new_slots = ma_slots(new_node, type);
- request = mas_data_end(mas) + 1;
- count = mt_alloc_bulk(gfp, request, (void **)new_slots);
- if (unlikely(count < request)) {
- memset(new_slots, 0, request * sizeof(void *));
- mas_set_err(mas, -ENOMEM);
+ count = mas->node_request = mas_data_end(mas) + 1;
+ mas_alloc_nodes(mas, gfp);
+ if (unlikely(mas_is_err(mas)))
return;
- }
- /* Restore node type information in slots. */
slots = ma_slots(node, type);
for (i = 0; i < count; i++) {
val = (unsigned long)mt_slot_locked(mas->tree, slots, i);
val &= MAPLE_NODE_MASK;
- ((unsigned long *)new_slots)[i] |= val;
+ new_slots[i] = ma_mnode_ptr((unsigned long)mas_pop_node(mas) |
+ val);
}
}
@@ -6736,7 +6315,7 @@ static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas,
/* Only allocate child nodes for non-leaf nodes. */
mas_dup_alloc(mas, new_mas, gfp);
if (unlikely(mas_is_err(mas)))
- return;
+ goto empty_mas;
} else {
/*
* This is the last leaf node and duplication is
@@ -6769,6 +6348,8 @@ set_new_tree:
/* Make them the same height */
new_mas->tree->ma_flags = mas->tree->ma_flags;
rcu_assign_pointer(new_mas->tree->ma_root, root);
+empty_mas:
+ mas_empty_nodes(mas);
}
/**
@@ -6906,7 +6487,7 @@ void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max)
unsigned long copy = *index;
#endif
- trace_ma_read(__func__, &mas);
+ trace_ma_read(TP_FCT, &mas);
if ((*index) > max)
return NULL;
@@ -7666,8 +7247,9 @@ void mas_dump(const struct ma_state *mas)
pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end,
mas->index, mas->last);
- pr_err(" min=%lx max=%lx alloc=" PTR_FMT ", depth=%u, flags=%x\n",
- mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags);
+ pr_err(" min=%lx max=%lx sheaf=" PTR_FMT ", request %lu depth=%u, flags=%x\n",
+ mas->min, mas->max, mas->sheaf, mas->node_request, mas->depth,
+ mas->mas_flags);
if (mas->index > mas->last)
pr_err("Check index & last\n");
}
diff --git a/lib/math/div64.c b/lib/math/div64.c
index 5faa29208bdb..bf77b9843175 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -212,12 +212,13 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
#endif
- /* make sure c is not zero, trigger exception otherwise */
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdiv-by-zero"
- if (unlikely(c == 0))
- return 1/0;
-#pragma GCC diagnostic pop
+ /* make sure c is not zero, trigger runtime exception otherwise */
+ if (unlikely(c == 0)) {
+ unsigned long zero = 0;
+
+ OPTIMIZER_HIDE_VAR(zero);
+ return ~0UL/zero;
+ }
int shift = __builtin_ctzll(c);
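The old code returned a literal 1/0 behind pragmas that suppressed -Wdiv-by-zero, but a compiler remains free to fold a provably-zero divisor at build time. Hiding the zero in an asm-opaque variable guarantees the division is actually executed, so the trap fires at run time on architectures where integer division by zero traps. The pattern in isolation:

    /* Sketch: defeat constant folding so the divide really executes.
     * OPTIMIZER_HIDE_VAR() is an empty asm that makes `zero' opaque.
     */
    static unsigned long example_force_div_trap(void)
    {
    	unsigned long zero = 0;

    	OPTIMIZER_HIDE_VAR(zero);
    	return ~0UL / zero;	/* evaluated at run time, not folded */
    }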
diff --git a/lib/math/gcd.c b/lib/math/gcd.c
index e3b042214d1b..62efca6787ae 100644
--- a/lib/math/gcd.c
+++ b/lib/math/gcd.c
@@ -11,22 +11,16 @@
* has decent hardware division.
*/
+DEFINE_STATIC_KEY_TRUE(efficient_ffs_key);
+
#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS)
/* If __ffs is available, the even/odd algorithm benchmarks slower. */
-/**
- * gcd - calculate and return the greatest common divisor of 2 unsigned longs
- * @a: first value
- * @b: second value
- */
-unsigned long gcd(unsigned long a, unsigned long b)
+static unsigned long binary_gcd(unsigned long a, unsigned long b)
{
unsigned long r = a | b;
- if (!a || !b)
- return r;
-
b >>= __ffs(b);
if (b == 1)
return r & -r;
@@ -44,9 +38,15 @@ unsigned long gcd(unsigned long a, unsigned long b)
}
}
-#else
+#endif
/* If normalization is done by loops, the even/odd algorithm is a win. */
+
+/**
+ * gcd - calculate and return the greatest common divisor of 2 unsigned longs
+ * @a: first value
+ * @b: second value
+ */
unsigned long gcd(unsigned long a, unsigned long b)
{
unsigned long r = a | b;
@@ -54,6 +54,11 @@ unsigned long gcd(unsigned long a, unsigned long b)
if (!a || !b)
return r;
+#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS)
+ if (static_branch_likely(&efficient_ffs_key))
+ return binary_gcd(a, b);
+#endif
+
/* Isolate lsbit of r */
r &= -r;
@@ -80,6 +85,4 @@ unsigned long gcd(unsigned long a, unsigned long b)
}
}
-#endif
-
EXPORT_SYMBOL_GPL(gcd);
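Both gcd variants are now always built, and the static key selects the __ffs-based one at run time instead of at configure time. For reference, the binary (Stein's) algorithm that binary_gcd() implements, sketched in portable C with __builtin_ctzl standing in for the kernel's __ffs():

    /* Sketch: binary GCD. Strip common factors of two, then repeatedly
     * subtract the smaller odd value from the larger. Assumes a and b
     * are both nonzero (the kernel's gcd() handles zero before this).
     */
    static unsigned long example_binary_gcd(unsigned long a, unsigned long b)
    {
    	unsigned int shift = __builtin_ctzl(a | b);

    	a >>= __builtin_ctzl(a);		/* make a odd */
    	do {
    		b >>= __builtin_ctzl(b);	/* make b odd */
    		if (a > b) {
    			unsigned long t = a;

    			a = b;
    			b = t;
    		}
    		b -= a;			/* odd - odd: b is now even */
    	} while (b);
    	return a << shift;		/* restore shared factors of 2 */
    }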
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 75ce3e134b7c..799e0e5eac26 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -18,9 +18,6 @@
#else
#include <linux/module.h>
#include <linux/gfp.h>
-/* In .bss so it's zeroed */
-const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
-EXPORT_SYMBOL(raid6_empty_zero_page);
#endif
struct raid6_calls raid6_call;
diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c
index 0a2e76035ea9..6d9474ce6da9 100644
--- a/lib/raid6/neon.c
+++ b/lib/raid6/neon.c
@@ -8,10 +8,9 @@
#include <linux/raid/pq.h>
#ifdef __KERNEL__
-#include <asm/neon.h>
+#include <asm/simd.h>
#else
-#define kernel_neon_begin()
-#define kernel_neon_end()
+#define scoped_ksimd()
#define cpu_has_neon() (1)
#endif
@@ -32,10 +31,9 @@
{ \
void raid6_neon ## _n ## _gen_syndrome_real(int, \
unsigned long, void**); \
- kernel_neon_begin(); \
- raid6_neon ## _n ## _gen_syndrome_real(disks, \
+ scoped_ksimd() \
+ raid6_neon ## _n ## _gen_syndrome_real(disks, \
(unsigned long)bytes, ptrs); \
- kernel_neon_end(); \
} \
static void raid6_neon ## _n ## _xor_syndrome(int disks, \
int start, int stop, \
@@ -43,10 +41,9 @@
{ \
void raid6_neon ## _n ## _xor_syndrome_real(int, \
int, int, unsigned long, void**); \
- kernel_neon_begin(); \
- raid6_neon ## _n ## _xor_syndrome_real(disks, \
- start, stop, (unsigned long)bytes, ptrs); \
- kernel_neon_end(); \
+ scoped_ksimd() \
+ raid6_neon ## _n ## _xor_syndrome_real(disks, \
+ start, stop, (unsigned long)bytes, ptrs);\
} \
struct raid6_calls const raid6_neonx ## _n = { \
raid6_neon ## _n ## _gen_syndrome, \
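kernel_neon_begin()/kernel_neon_end() pairs become scoped_ksimd(), which scopes the SIMD-enabled region like a guard: the vector unit is released when the attached statement or block exits, so early returns cannot leak the state. Assuming scoped_ksimd() expands to a one-trip scope of that kind, usage reduces to:

    /* Sketch: scope-based SIMD section. example_vector_core() is a
     * hypothetical stand-in for the real NEON syndrome routine.
     */
    static void example_vector_core(int disks, unsigned long bytes, void **ptrs);

    static void example_gen_syndrome(int disks, unsigned long bytes,
    				 void **ptrs)
    {
    	scoped_ksimd() {
    		/* vector code may run anywhere in this block */
    		example_vector_core(disks, bytes, ptrs);
    	}
    }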
diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c
index a7c1b2bbe40d..b5e47c008b41 100644
--- a/lib/raid6/recov.c
+++ b/lib/raid6/recov.c
@@ -31,10 +31,10 @@ static void raid6_2data_recov_intx1(int disks, size_t bytes, int faila,
Use the dead data pages as temporary storage for
delta p and delta q */
dp = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks-2] = dp;
dq = (u8 *)ptrs[failb];
- ptrs[failb] = (void *)raid6_empty_zero_page;
+ ptrs[failb] = raid6_get_zero_page();
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -72,7 +72,7 @@ static void raid6_datap_recov_intx1(int disks, size_t bytes, int faila,
/* Compute syndrome with zero for the missing data page
Use the dead data page as temporary storage for delta q */
dq = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c
index 4e8095403ee2..97d598d2535c 100644
--- a/lib/raid6/recov_avx2.c
+++ b/lib/raid6/recov_avx2.c
@@ -28,10 +28,10 @@ static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila,
Use the dead data pages as temporary storage for
delta p and delta q */
dp = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks-2] = dp;
dq = (u8 *)ptrs[failb];
- ptrs[failb] = (void *)raid6_empty_zero_page;
+ ptrs[failb] = raid6_get_zero_page();
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -196,7 +196,7 @@ static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila,
/* Compute syndrome with zero for the missing data page
Use the dead data page as temporary storage for delta q */
dq = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/recov_avx512.c b/lib/raid6/recov_avx512.c
index 310c715db313..7986120ca444 100644
--- a/lib/raid6/recov_avx512.c
+++ b/lib/raid6/recov_avx512.c
@@ -37,10 +37,10 @@ static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila,
*/
dp = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks-2] = dp;
dq = (u8 *)ptrs[failb];
- ptrs[failb] = (void *)raid6_empty_zero_page;
+ ptrs[failb] = raid6_get_zero_page();
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -238,7 +238,7 @@ static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila,
*/
dq = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/recov_loongarch_simd.c b/lib/raid6/recov_loongarch_simd.c
index 94aeac85e6f7..93dc515997a1 100644
--- a/lib/raid6/recov_loongarch_simd.c
+++ b/lib/raid6/recov_loongarch_simd.c
@@ -42,10 +42,10 @@ static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila,
* delta p and delta q
*/
dp = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks - 2] = dp;
dq = (u8 *)ptrs[failb];
- ptrs[failb] = (void *)raid6_empty_zero_page;
+ ptrs[failb] = raid6_get_zero_page();
ptrs[disks - 1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -197,7 +197,7 @@ static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila,
* Use the dead data page as temporary storage for delta q
*/
dq = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks - 1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -316,10 +316,10 @@ static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila,
* delta p and delta q
*/
dp = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks - 2] = dp;
dq = (u8 *)ptrs[failb];
- ptrs[failb] = (void *)raid6_empty_zero_page;
+ ptrs[failb] = raid6_get_zero_page();
ptrs[disks - 1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -436,7 +436,7 @@ static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila,
* Use the dead data page as temporary storage for delta q
*/
dq = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks - 1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/recov_neon.c b/lib/raid6/recov_neon.c
index 1bfc14174d4d..9d99aeabd31a 100644
--- a/lib/raid6/recov_neon.c
+++ b/lib/raid6/recov_neon.c
@@ -7,11 +7,10 @@
#include <linux/raid/pq.h>
#ifdef __KERNEL__
-#include <asm/neon.h>
+#include <asm/simd.h>
#include "neon.h"
#else
-#define kernel_neon_begin()
-#define kernel_neon_end()
+#define scoped_ksimd()
#define cpu_has_neon() (1)
#endif
@@ -36,10 +35,10 @@ static void raid6_2data_recov_neon(int disks, size_t bytes, int faila,
* delta p and delta q
*/
dp = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks - 2] = dp;
dq = (u8 *)ptrs[failb];
- ptrs[failb] = (void *)raid6_empty_zero_page;
+ ptrs[failb] = raid6_get_zero_page();
ptrs[disks - 1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -55,9 +54,8 @@ static void raid6_2data_recov_neon(int disks, size_t bytes, int faila,
qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
raid6_gfexp[failb]]];
- kernel_neon_begin();
- __raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul);
- kernel_neon_end();
+ scoped_ksimd()
+ __raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul);
}
static void raid6_datap_recov_neon(int disks, size_t bytes, int faila,
@@ -74,7 +72,7 @@ static void raid6_datap_recov_neon(int disks, size_t bytes, int faila,
* Use the dead data page as temporary storage for delta q
*/
dq = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks - 1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -86,9 +84,8 @@ static void raid6_datap_recov_neon(int disks, size_t bytes, int faila,
/* Now, pick the proper data tables */
qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
- kernel_neon_begin();
- __raid6_datap_recov_neon(bytes, p, q, dq, qmul);
- kernel_neon_end();
+ scoped_ksimd()
+ __raid6_datap_recov_neon(bytes, p, q, dq, qmul);
}
const struct raid6_recov_calls raid6_recov_neon = {
diff --git a/lib/raid6/recov_rvv.c b/lib/raid6/recov_rvv.c
index f29303795ccf..5f779719c3d3 100644
--- a/lib/raid6/recov_rvv.c
+++ b/lib/raid6/recov_rvv.c
@@ -4,9 +4,7 @@
* Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
*/
-#include <asm/simd.h>
#include <asm/vector.h>
-#include <crypto/internal/simd.h>
#include <linux/raid/pq.h>
static int rvv_has_vector(void)
@@ -165,10 +163,10 @@ static void raid6_2data_recov_rvv(int disks, size_t bytes, int faila,
* delta p and delta q
*/
dp = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks - 2] = dp;
dq = (u8 *)ptrs[failb];
- ptrs[failb] = (void *)raid6_empty_zero_page;
+ ptrs[failb] = raid6_get_zero_page();
ptrs[disks - 1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -203,7 +201,7 @@ static void raid6_datap_recov_rvv(int disks, size_t bytes, int faila,
* Use the dead data page as temporary storage for delta q
*/
dq = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks - 1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/recov_s390xc.c b/lib/raid6/recov_s390xc.c
index 179eec900cea..487018f81192 100644
--- a/lib/raid6/recov_s390xc.c
+++ b/lib/raid6/recov_s390xc.c
@@ -6,7 +6,6 @@
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
*/
-#include <linux/export.h>
#include <linux/raid/pq.h>
static inline void xor_block(u8 *p1, u8 *p2)
@@ -35,10 +34,10 @@ static void raid6_2data_recov_s390xc(int disks, size_t bytes, int faila,
Use the dead data pages as temporary storage for
delta p and delta q */
dp = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks-2] = dp;
dq = (u8 *)ptrs[failb];
- ptrs[failb] = (void *)raid6_empty_zero_page;
+ ptrs[failb] = raid6_get_zero_page();
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -82,7 +81,7 @@ static void raid6_datap_recov_s390xc(int disks, size_t bytes, int faila,
/* Compute syndrome with zero for the missing data page
Use the dead data page as temporary storage for delta q */
dq = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c
index 4bfa3c6b60de..2e849185c32b 100644
--- a/lib/raid6/recov_ssse3.c
+++ b/lib/raid6/recov_ssse3.c
@@ -30,10 +30,10 @@ static void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila,
Use the dead data pages as temporary storage for
delta p and delta q */
dp = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks-2] = dp;
dq = (u8 *)ptrs[failb];
- ptrs[failb] = (void *)raid6_empty_zero_page;
+ ptrs[failb] = raid6_get_zero_page();
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -203,7 +203,7 @@ static void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila,
/* Compute syndrome with zero for the missing data page
Use the dead data page as temporary storage for delta q */
dq = (u8 *)ptrs[faila];
- ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[faila] = raid6_get_zero_page();
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/rvv.c b/lib/raid6/rvv.c
index f0887344b274..89da5fc247aa 100644
--- a/lib/raid6/rvv.c
+++ b/lib/raid6/rvv.c
@@ -9,11 +9,8 @@
* Copyright 2002-2004 H. Peter Anvin
*/
-#include <asm/simd.h>
#include <asm/vector.h>
-#include <crypto/internal/simd.h>
#include <linux/raid/pq.h>
-#include <linux/types.h>
#include "rvv.h"
#define NSIZE (riscv_v_vsize / 32) /* NSIZE = vlenb */
@@ -26,9 +23,9 @@ static int rvv_has_vector(void)
static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
u8 **dptr = (u8 **)ptrs;
- unsigned long d;
- int z, z0;
u8 *p, *q;
+ unsigned long vl, d;
+ int z, z0;
z0 = disks - 3; /* Highest data disk */
p = dptr[z0 + 1]; /* XOR parity */
@@ -36,8 +33,9 @@ static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **
asm volatile (".option push\n"
".option arch,+v\n"
- "vsetvli t0, x0, e8, m1, ta, ma\n"
+ "vsetvli %0, x0, e8, m1, ta, ma\n"
".option pop\n"
+ : "=&r" (vl)
);
/* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
@@ -46,7 +44,7 @@ static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE])
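Two fixes recur in every hunk of this file: vsetvli's result goes into a C output operand instead of silently clobbering t0 (a register the compiler may be using for something else), and the second load of the same address becomes an in-register copy. The corrected shape in isolation, with src standing in for the source pointer:

    /* Sketch: declare vsetvli's destination as an output so the compiler
     * knows a GPR is written, and duplicate a vector with vmv.v.v instead
     * of re-reading memory with a second vle8.v.
     */
    unsigned long vl;

    asm volatile (".option push\n"
    	      ".option arch,+v\n"
    	      "vsetvli %0, x0, e8, m1, ta, ma\n"
    	      "vle8.v  v0, (%1)\n"	/* load once */
    	      "vmv.v.v v1, v0\n"	/* copy, no second load */
    	      ".option pop\n"
    	      : "=&r" (vl)
    	      : "r" (src)
    	      : "memory");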
@@ -99,7 +97,7 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
{
u8 **dptr = (u8 **)ptrs;
u8 *p, *q;
- unsigned long d;
+ unsigned long vl, d;
int z, z0;
z0 = stop; /* P/Q right side optimization */
@@ -108,8 +106,9 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
asm volatile (".option push\n"
".option arch,+v\n"
- "vsetvli t0, x0, e8, m1, ta, ma\n"
+ "vsetvli %0, x0, e8, m1, ta, ma\n"
".option pop\n"
+ : "=&r" (vl)
);
/* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
@@ -118,7 +117,7 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE])
@@ -195,9 +194,9 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
u8 **dptr = (u8 **)ptrs;
- unsigned long d;
- int z, z0;
u8 *p, *q;
+ unsigned long vl, d;
+ int z, z0;
z0 = disks - 3; /* Highest data disk */
p = dptr[z0 + 1]; /* XOR parity */
@@ -205,8 +204,9 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **
asm volatile (".option push\n"
".option arch,+v\n"
- "vsetvli t0, x0, e8, m1, ta, ma\n"
+ "vsetvli %0, x0, e8, m1, ta, ma\n"
".option pop\n"
+ : "=&r" (vl)
);
/*
@@ -218,9 +218,9 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
@@ -287,7 +287,7 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
{
u8 **dptr = (u8 **)ptrs;
u8 *p, *q;
- unsigned long d;
+ unsigned long vl, d;
int z, z0;
z0 = stop; /* P/Q right side optimization */
@@ -296,8 +296,9 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
asm volatile (".option push\n"
".option arch,+v\n"
- "vsetvli t0, x0, e8, m1, ta, ma\n"
+ "vsetvli %0, x0, e8, m1, ta, ma\n"
".option pop\n"
+ : "=&r" (vl)
);
/*
@@ -309,9 +310,9 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
@@ -413,9 +414,9 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
u8 **dptr = (u8 **)ptrs;
- unsigned long d;
- int z, z0;
u8 *p, *q;
+ unsigned long vl, d;
+ int z, z0;
z0 = disks - 3; /* Highest data disk */
p = dptr[z0 + 1]; /* XOR parity */
@@ -423,8 +424,9 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
asm volatile (".option push\n"
".option arch,+v\n"
- "vsetvli t0, x0, e8, m1, ta, ma\n"
+ "vsetvli %0, x0, e8, m1, ta, ma\n"
".option pop\n"
+ : "=&r" (vl)
);
/*
@@ -438,13 +440,13 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
"vle8.v v8, (%[wp2])\n"
- "vle8.v v9, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
"vle8.v v12, (%[wp3])\n"
- "vle8.v v13, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
@@ -539,7 +541,7 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
{
u8 **dptr = (u8 **)ptrs;
u8 *p, *q;
- unsigned long d;
+ unsigned long vl, d;
int z, z0;
z0 = stop; /* P/Q right side optimization */
@@ -548,8 +550,9 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
asm volatile (".option push\n"
".option arch,+v\n"
- "vsetvli t0, x0, e8, m1, ta, ma\n"
+ "vsetvli %0, x0, e8, m1, ta, ma\n"
".option pop\n"
+ : "=&r" (vl)
);
/*
@@ -563,13 +566,13 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
"vle8.v v8, (%[wp2])\n"
- "vle8.v v9, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
"vle8.v v12, (%[wp3])\n"
- "vle8.v v13, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
@@ -721,9 +724,9 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
u8 **dptr = (u8 **)ptrs;
- unsigned long d;
- int z, z0;
u8 *p, *q;
+ unsigned long vl, d;
+ int z, z0;
z0 = disks - 3; /* Highest data disk */
p = dptr[z0 + 1]; /* XOR parity */
@@ -731,8 +734,9 @@ static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void **
asm volatile (".option push\n"
".option arch,+v\n"
- "vsetvli t0, x0, e8, m1, ta, ma\n"
+ "vsetvli %0, x0, e8, m1, ta, ma\n"
".option pop\n"
+ : "=&r" (vl)
);
/*
@@ -750,21 +754,21 @@ static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void **
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
"vle8.v v8, (%[wp2])\n"
- "vle8.v v9, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
"vle8.v v12, (%[wp3])\n"
- "vle8.v v13, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
"vle8.v v16, (%[wp4])\n"
- "vle8.v v17, (%[wp4])\n"
+ "vmv.v.v v17, v16\n"
"vle8.v v20, (%[wp5])\n"
- "vle8.v v21, (%[wp5])\n"
+ "vmv.v.v v21, v20\n"
"vle8.v v24, (%[wp6])\n"
- "vle8.v v25, (%[wp6])\n"
+ "vmv.v.v v25, v24\n"
"vle8.v v28, (%[wp7])\n"
- "vle8.v v29, (%[wp7])\n"
+ "vmv.v.v v29, v28\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
@@ -915,7 +919,7 @@ static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop,
{
u8 **dptr = (u8 **)ptrs;
u8 *p, *q;
- unsigned long d;
+ unsigned long vl, d;
int z, z0;
z0 = stop; /* P/Q right side optimization */
@@ -924,8 +928,9 @@ static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop,
asm volatile (".option push\n"
".option arch,+v\n"
- "vsetvli t0, x0, e8, m1, ta, ma\n"
+ "vsetvli %0, x0, e8, m1, ta, ma\n"
".option pop\n"
+ : "=&r" (vl)
);
/*
@@ -943,21 +948,21 @@ static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop,
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
"vle8.v v8, (%[wp2])\n"
- "vle8.v v9, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
"vle8.v v12, (%[wp3])\n"
- "vle8.v v13, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
"vle8.v v16, (%[wp4])\n"
- "vle8.v v17, (%[wp4])\n"
+ "vmv.v.v v17, v16\n"
"vle8.v v20, (%[wp5])\n"
- "vle8.v v21, (%[wp5])\n"
+ "vmv.v.v v21, v20\n"
"vle8.v v24, (%[wp6])\n"
- "vle8.v v25, (%[wp6])\n"
+ "vmv.v.v v25, v24\n"
"vle8.v v28, (%[wp7])\n"
- "vle8.v v29, (%[wp7])\n"
+ "vmv.v.v v29, v28\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
diff --git a/lib/ref_tracker.c b/lib/ref_tracker.c
index cf5609b1ca79..258fb0e7abdf 100644
--- a/lib/ref_tracker.c
+++ b/lib/ref_tracker.c
@@ -8,6 +8,7 @@
#include <linux/slab.h>
#include <linux/stacktrace.h>
#include <linux/stackdepot.h>
+#include <linux/seq_file.h>
#define REF_TRACKER_STACK_ENTRIES 16
#define STACK_BUF_SIZE 1024
@@ -28,6 +29,45 @@ struct ref_tracker_dir_stats {
} stacks[];
};
+#ifdef CONFIG_DEBUG_FS
+#include <linux/xarray.h>
+
+/*
+ * ref_tracker_dir_init() is usually called in allocation-safe contexts, but
+ * the same is not true of ref_tracker_dir_exit() which can be called from
+ * anywhere an object is freed. Removing debugfs dentries is a blocking
+ * operation, so we defer that work to the debugfs_reap_worker.
+ *
+ * Each dentry is tracked in the appropriate xarray. When
+ * ref_tracker_dir_exit() is called, its entries in the xarrays are marked and
+ * the workqueue job is scheduled. The worker then runs and deletes any marked
+ * dentries asynchronously.
+ */
+static struct xarray debugfs_dentries;
+static struct xarray debugfs_symlinks;
+static struct work_struct debugfs_reap_worker;
+
+#define REF_TRACKER_DIR_DEAD XA_MARK_0
+static inline void ref_tracker_debugfs_mark(struct ref_tracker_dir *dir)
+{
+ unsigned long flags;
+
+ xa_lock_irqsave(&debugfs_dentries, flags);
+ __xa_set_mark(&debugfs_dentries, (unsigned long)dir, REF_TRACKER_DIR_DEAD);
+ xa_unlock_irqrestore(&debugfs_dentries, flags);
+
+ xa_lock_irqsave(&debugfs_symlinks, flags);
+ __xa_set_mark(&debugfs_symlinks, (unsigned long)dir, REF_TRACKER_DIR_DEAD);
+ xa_unlock_irqrestore(&debugfs_symlinks, flags);
+
+ schedule_work(&debugfs_reap_worker);
+}
+#else
+static inline void ref_tracker_debugfs_mark(struct ref_tracker_dir *dir)
+{
+}
+#endif
+
static struct ref_tracker_dir_stats *
ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit)
{
@@ -35,7 +75,7 @@ ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit)
struct ref_tracker *tracker;
stats = kmalloc(struct_size(stats, stacks, limit),
- GFP_NOWAIT | __GFP_NOWARN);
+ GFP_NOWAIT);
if (!stats)
return ERR_PTR(-ENOMEM);
stats->total = 0;
@@ -63,21 +103,39 @@ ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit)
}
struct ostream {
+ void __ostream_printf (*func)(struct ostream *stream, char *fmt, ...);
+ char *prefix;
char *buf;
+ struct seq_file *seq;
int size, used;
};
+static void __ostream_printf pr_ostream_log(struct ostream *stream, char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ vprintk(fmt, args);
+ va_end(args);
+}
+
+static void __ostream_printf pr_ostream_buf(struct ostream *stream, char *fmt, ...)
+{
+ int ret, len = stream->size - stream->used;
+ va_list args;
+
+ va_start(args, fmt);
+ ret = vsnprintf(stream->buf + stream->used, len, fmt, args);
+ va_end(args);
+ if (ret > 0)
+ stream->used += min(ret, len);
+}
+
#define pr_ostream(stream, fmt, args...) \
({ \
struct ostream *_s = (stream); \
\
- if (!_s->buf) { \
- pr_err(fmt, ##args); \
- } else { \
- int ret, len = _s->size - _s->used; \
- ret = snprintf(_s->buf + _s->used, len, pr_fmt(fmt), ##args); \
- _s->used += min(ret, len); \
- } \
+ _s->func(_s, fmt, ##args); \
})
static void
@@ -96,25 +154,26 @@ __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir,
stats = ref_tracker_get_stats(dir, display_limit);
if (IS_ERR(stats)) {
- pr_ostream(s, "%s@%pK: couldn't get stats, error %pe\n",
- dir->name, dir, stats);
+ pr_ostream(s, "%s%s@%p: couldn't get stats, error %pe\n",
+ s->prefix, dir->class, dir, stats);
return;
}
- sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT | __GFP_NOWARN);
+ sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT);
for (i = 0, skipped = stats->total; i < stats->count; ++i) {
stack = stats->stacks[i].stack_handle;
if (sbuf && !stack_depot_snprint(stack, sbuf, STACK_BUF_SIZE, 4))
sbuf[0] = 0;
- pr_ostream(s, "%s@%pK has %d/%d users at\n%s\n", dir->name, dir,
- stats->stacks[i].count, stats->total, sbuf);
+ pr_ostream(s, "%s%s@%p has %d/%d users at\n%s\n", s->prefix,
+ dir->class, dir, stats->stacks[i].count,
+ stats->total, sbuf);
skipped -= stats->stacks[i].count;
}
if (skipped)
- pr_ostream(s, "%s@%pK skipped reports about %d/%d users.\n",
- dir->name, dir, skipped, stats->total);
+ pr_ostream(s, "%s%s@%p skipped reports about %d/%d users.\n",
+ s->prefix, dir->class, dir, skipped, stats->total);
kfree(sbuf);
@@ -124,7 +183,8 @@ __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir,
void ref_tracker_dir_print_locked(struct ref_tracker_dir *dir,
unsigned int display_limit)
{
- struct ostream os = {};
+ struct ostream os = { .func = pr_ostream_log,
+ .prefix = "ref_tracker: " };
__ref_tracker_dir_pr_ostream(dir, display_limit, &os);
}
@@ -143,7 +203,10 @@ EXPORT_SYMBOL(ref_tracker_dir_print);
int ref_tracker_dir_snprint(struct ref_tracker_dir *dir, char *buf, size_t size)
{
- struct ostream os = { .buf = buf, .size = size };
+ struct ostream os = { .func = pr_ostream_buf,
+ .prefix = "ref_tracker: ",
+ .buf = buf,
+ .size = size };
unsigned long flags;
spin_lock_irqsave(&dir->lock, flags);
@@ -161,6 +224,11 @@ void ref_tracker_dir_exit(struct ref_tracker_dir *dir)
bool leak = false;
dir->dead = true;
+ /*
+ * The xarray entries must be marked before the dir->lock is taken to
+ * protect simultaneous debugfs readers.
+ */
+ ref_tracker_debugfs_mark(dir);
spin_lock_irqsave(&dir->lock, flags);
list_for_each_entry_safe(tracker, n, &dir->quarantine, head) {
list_del(&tracker->head);
@@ -238,7 +306,7 @@ int ref_tracker_free(struct ref_tracker_dir *dir,
}
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1);
stack_handle = stack_depot_save(entries, nr_entries,
- GFP_NOWAIT | __GFP_NOWARN);
+ GFP_NOWAIT);
spin_lock_irqsave(&dir->lock, flags);
if (tracker->dead) {
@@ -273,3 +341,194 @@ int ref_tracker_free(struct ref_tracker_dir *dir,
return 0;
}
EXPORT_SYMBOL_GPL(ref_tracker_free);
+
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+
+static struct dentry *ref_tracker_debug_dir = (struct dentry *)-ENOENT;
+
+static void __ostream_printf pr_ostream_seq(struct ostream *stream, char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ seq_vprintf(stream->seq, fmt, args);
+ va_end(args);
+}
+
+static int ref_tracker_dir_seq_print(struct ref_tracker_dir *dir, struct seq_file *seq)
+{
+ struct ostream os = { .func = pr_ostream_seq,
+ .prefix = "",
+ .seq = seq };
+
+ __ref_tracker_dir_pr_ostream(dir, 16, &os);
+
+ return os.used;
+}
+
+static int ref_tracker_debugfs_show(struct seq_file *f, void *v)
+{
+ struct ref_tracker_dir *dir = f->private;
+ unsigned long index = (unsigned long)dir;
+ unsigned long flags;
+ int ret;
+
+ /*
+ * "dir" may not exist at this point if ref_tracker_dir_exit() has
+ * already been called. Take care not to dereference it until its
+ * legitimacy is established.
+ *
+ * The xa_lock is necessary to ensure that "dir" doesn't disappear
+ * before its lock can be taken. If it's in the xarray and not marked
+ * dead, then it's safe to take dir->lock, which prevents
+ * ref_tracker_dir_exit() from completing. Once the dir->lock is
+ * acquired, the xa_lock can be released. All of this must be IRQ-safe.
+ */
+ xa_lock_irqsave(&debugfs_dentries, flags);
+ if (!xa_load(&debugfs_dentries, index) ||
+ xa_get_mark(&debugfs_dentries, index, REF_TRACKER_DIR_DEAD)) {
+ xa_unlock_irqrestore(&debugfs_dentries, flags);
+ return -ENODATA;
+ }
+
+ spin_lock(&dir->lock);
+ xa_unlock(&debugfs_dentries);
+ ret = ref_tracker_dir_seq_print(dir, f);
+ spin_unlock_irqrestore(&dir->lock, flags);
+ return ret;
+}
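+
One subtlety worth spelling out: the IRQ flags saved by xa_lock_irqsave() are handed off together with the lock, which is why the xarray is released with a plain xa_unlock() and the flags are only restored when dir->lock is dropped. Condensed, the idiom used above is:

	xa_lock_irqsave(&debugfs_dentries, flags);	/* pins dir, IRQs off */
	spin_lock(&dir->lock);				/* nests under xa_lock */
	xa_unlock(&debugfs_dentries);			/* plain unlock: IRQs stay off */
	/* ... dir is safe to dereference here ... */
	spin_unlock_irqrestore(&dir->lock, flags);	/* restores the saved flags */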
+
+static int ref_tracker_debugfs_open(struct inode *inode, struct file *filp)
+{
+ struct ref_tracker_dir *dir = inode->i_private;
+
+ return single_open(filp, ref_tracker_debugfs_show, dir);
+}
+
+static const struct file_operations ref_tracker_debugfs_fops = {
+ .owner = THIS_MODULE,
+ .open = ref_tracker_debugfs_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/**
+ * ref_tracker_dir_debugfs - create debugfs file for ref_tracker_dir
+ * @dir: ref_tracker_dir to be associated with debugfs file
+ *
+ * In most cases, a debugfs file will be created automatically for every
+ * ref_tracker_dir. If the object was created before debugfs is brought up
+ * then that may fail. In those cases, it is safe to call this at a later
+ * time to create the file.
+ */
+void ref_tracker_dir_debugfs(struct ref_tracker_dir *dir)
+{
+ char name[NAME_MAX + 1];
+ struct dentry *dentry;
+ int ret;
+
+ /* No-op if already created */
+ dentry = xa_load(&debugfs_dentries, (unsigned long)dir);
+ if (dentry && !xa_is_err(dentry))
+ return;
+
+ ret = snprintf(name, sizeof(name), "%s@%p", dir->class, dir);
+ name[sizeof(name) - 1] = '\0';
+
+ if (ret < sizeof(name)) {
+ dentry = debugfs_create_file(name, S_IFREG | 0400,
+ ref_tracker_debug_dir, dir,
+ &ref_tracker_debugfs_fops);
+ if (!IS_ERR(dentry)) {
+ void *old;
+
+ old = xa_store_irq(&debugfs_dentries, (unsigned long)dir,
+ dentry, GFP_KERNEL);
+
+ if (xa_is_err(old))
+ debugfs_remove(dentry);
+ else
+ WARN_ON_ONCE(old);
+ }
+ }
+}
+EXPORT_SYMBOL(ref_tracker_dir_debugfs);
+
+void __ostream_printf ref_tracker_dir_symlink(struct ref_tracker_dir *dir, const char *fmt, ...)
+{
+ char name[NAME_MAX + 1];
+ struct dentry *symlink, *dentry;
+ va_list args;
+ int ret;
+
+ symlink = xa_load(&debugfs_symlinks, (unsigned long)dir);
+ dentry = xa_load(&debugfs_dentries, (unsigned long)dir);
+
+ /* Already created? */
+ if (symlink && !xa_is_err(symlink))
+ return;
+
+ if (!dentry || xa_is_err(dentry))
+ return;
+
+ va_start(args, fmt);
+ ret = vsnprintf(name, sizeof(name), fmt, args);
+ va_end(args);
+ name[sizeof(name) - 1] = '\0';
+
+ if (ret < sizeof(name)) {
+ symlink = debugfs_create_symlink(name, ref_tracker_debug_dir,
+ dentry->d_name.name);
+ if (!IS_ERR(symlink)) {
+ void *old;
+
+ old = xa_store_irq(&debugfs_symlinks, (unsigned long)dir,
+ symlink, GFP_KERNEL);
+ if (xa_is_err(old))
+ debugfs_remove(symlink);
+ else
+ WARN_ON_ONCE(old);
+ }
+ }
+}
+EXPORT_SYMBOL(ref_tracker_dir_symlink);
+
+static void debugfs_reap_work(struct work_struct *work)
+{
+ struct dentry *dentry;
+ unsigned long index;
+ bool reaped;
+
+ do {
+ reaped = false;
+ xa_for_each_marked(&debugfs_symlinks, index, dentry, REF_TRACKER_DIR_DEAD) {
+ xa_erase_irq(&debugfs_symlinks, index);
+ debugfs_remove(dentry);
+ reaped = true;
+ }
+ xa_for_each_marked(&debugfs_dentries, index, dentry, REF_TRACKER_DIR_DEAD) {
+ xa_erase_irq(&debugfs_dentries, index);
+ debugfs_remove(dentry);
+ reaped = true;
+ }
+ } while (reaped);
+}
+
+static int __init ref_tracker_debugfs_postcore_init(void)
+{
+ INIT_WORK(&debugfs_reap_worker, debugfs_reap_work);
+ xa_init_flags(&debugfs_dentries, XA_FLAGS_LOCK_IRQ);
+ xa_init_flags(&debugfs_symlinks, XA_FLAGS_LOCK_IRQ);
+ return 0;
+}
+postcore_initcall(ref_tracker_debugfs_postcore_init);
+
+static int __init ref_tracker_debugfs_late_init(void)
+{
+ ref_tracker_debug_dir = debugfs_create_dir("ref_tracker", NULL);
+ return 0;
+}
+late_initcall(ref_tracker_debugfs_late_init);
+#endif /* CONFIG_DEBUG_FS */
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 3e555d012ed6..fde0f0e556f8 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -184,8 +184,8 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
static struct lock_class_key __key;
tbl = alloc_hooks_tag(ht->alloc_tag,
- kvmalloc_node_noprof(struct_size(tbl, buckets, nbuckets),
- gfp|__GFP_ZERO, NUMA_NO_NODE));
+ kvmalloc_node_align_noprof(struct_size(tbl, buckets, nbuckets),
+ 1, gfp|__GFP_ZERO, NUMA_NO_NODE));
size = nbuckets;
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index d3412984170c..4d188d05db15 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -208,8 +208,28 @@ static int sbitmap_find_bit_in_word(struct sbitmap_word *map,
return nr;
}
+static unsigned int __map_depth_with_shallow(const struct sbitmap *sb,
+ int index,
+ unsigned int shallow_depth)
+{
+ u64 shallow_word_depth;
+ unsigned int word_depth, remainder;
+
+ word_depth = __map_depth(sb, index);
+ if (shallow_depth >= sb->depth)
+ return word_depth;
+
+ shallow_word_depth = word_depth * shallow_depth;
+ remainder = do_div(shallow_word_depth, sb->depth);
+
+ if (remainder >= (index + 1) * word_depth)
+ shallow_word_depth++;
+
+ return (unsigned int)shallow_word_depth;
+}
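
For intuition, a worked example with assumed numbers (not taken from the patch): a bitmap of sb->depth = 64 split into four words of word_depth = 16, with shallow_depth = 16 requested, gives

	shallow_word_depth = 16 * 16 / 64 = 4 bits usable per word

so the shallow user can take at most 4 * 4 = 16 bits across the whole bitmap, i.e. exactly shallow_depth (the remainder-based correction spreads any leftover bits across the leading words). Under the old per-word min(word_depth, shallow_depth) clamp, the same caller could have taken 16 bits from every word.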
+
static int sbitmap_find_bit(struct sbitmap *sb,
- unsigned int depth,
+ unsigned int shallow_depth,
unsigned int index,
unsigned int alloc_hint,
bool wrap)
@@ -218,12 +238,12 @@ static int sbitmap_find_bit(struct sbitmap *sb,
int nr = -1;
for (i = 0; i < sb->map_nr; i++) {
- nr = sbitmap_find_bit_in_word(&sb->map[index],
- min_t(unsigned int,
- __map_depth(sb, index),
- depth),
- alloc_hint, wrap);
+ unsigned int depth = __map_depth_with_shallow(sb, index,
+ shallow_depth);
+ if (depth)
+ nr = sbitmap_find_bit_in_word(&sb->map[index], depth,
+ alloc_hint, wrap);
if (nr != -1) {
nr += index << sb->shift;
break;
@@ -287,7 +307,22 @@ static int __sbitmap_get_shallow(struct sbitmap *sb,
return sbitmap_find_bit(sb, shallow_depth, index, alloc_hint, true);
}
-int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth)
+/**
+ * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap,
+ * limiting the depth used from each word.
+ * @sb: Bitmap to allocate from.
+ * @shallow_depth: The maximum number of bits to allocate from the bitmap.
+ *
+ * This rather specific operation allows for having multiple users with
+ * different allocation limits. E.g., there can be a high-priority class that
+ * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow()
+ * with a @shallow_depth of (sb->depth >> 1). Then, the low-priority
+ * class can only allocate half of the total bits in the bitmap, preventing it
+ * from starving out the high-priority class.
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+static int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth)
{
int nr;
unsigned int hint, depth;
@@ -302,7 +337,6 @@ int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth)
return nr;
}
-EXPORT_SYMBOL_GPL(sbitmap_get_shallow);
bool sbitmap_any_bit_set(const struct sbitmap *sb)
{
@@ -406,27 +440,9 @@ EXPORT_SYMBOL_GPL(sbitmap_bitmap_show);
static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq,
unsigned int depth)
{
- unsigned int wake_batch;
- unsigned int shallow_depth;
-
- /*
- * Each full word of the bitmap has bits_per_word bits, and there might
- * be a partial word. There are depth / bits_per_word full words and
- * depth % bits_per_word bits left over. In bitwise arithmetic:
- *
- * bits_per_word = 1 << shift
- * depth / bits_per_word = depth >> shift
- * depth % bits_per_word = depth & ((1 << shift) - 1)
- *
- * Each word can be limited to sbq->min_shallow_depth bits.
- */
- shallow_depth = min(1U << sbq->sb.shift, sbq->min_shallow_depth);
- depth = ((depth >> sbq->sb.shift) * shallow_depth +
- min(depth & ((1U << sbq->sb.shift) - 1), shallow_depth));
- wake_batch = clamp_t(unsigned int, depth / SBQ_WAIT_QUEUES, 1,
- SBQ_WAKE_BATCH);
-
- return wake_batch;
+ return clamp_t(unsigned int,
+ min(depth, sbq->min_shallow_depth) / SBQ_WAIT_QUEUES,
+ 1, SBQ_WAKE_BATCH);
}
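
Assuming the usual header constants SBQ_WAIT_QUEUES == 8 and SBQ_WAKE_BATCH == 8 (true at the time of writing), the simplified formula works out as, e.g.:

	depth = 256, min_shallow_depth = 64:  min(256, 64) / 8 = 8  ->  wake_batch = 8
	depth = 32,  min_shallow_depth = 64:  min(32, 64)  / 8 = 4  ->  wake_batch = 4
	depth = 4,   min_shallow_depth = 64:  min(4, 64)   / 8 = 0  ->  clamped up to 1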
int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index a2bb7738c373..94b3f6b19538 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -22,10 +22,8 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2)
if (is_percpu_thread())
goto out;
-#ifdef CONFIG_SMP
if (current->migration_disabled)
goto out;
-#endif
/*
* It is valid to assume CPU-locality during early bootup:
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 73d7b50924ef..de0b0025af2b 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -36,11 +36,11 @@
#include <linux/memblock.h>
#include <linux/kasan-enabled.h>
-#define DEPOT_POOLS_CAP 8192
-/* The pool_index is offset by 1 so the first record does not have a 0 handle. */
-#define DEPOT_MAX_POOLS \
- (((1LL << (DEPOT_POOL_INDEX_BITS)) - 1 < DEPOT_POOLS_CAP) ? \
- (1LL << (DEPOT_POOL_INDEX_BITS)) - 1 : DEPOT_POOLS_CAP)
+/*
+ * The pool_index is offset by 1 so the first record does not have a 0 handle.
+ */
+static unsigned int stack_max_pools __read_mostly =
+ MIN((1LL << DEPOT_POOL_INDEX_BITS) - 1, 8192);
static bool stack_depot_disabled;
static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT);
@@ -62,7 +62,7 @@ static unsigned int stack_bucket_number_order;
static unsigned int stack_hash_mask;
/* Array of memory regions that store stack records. */
-static void *stack_pools[DEPOT_MAX_POOLS];
+static void **stack_pools;
/* Newly allocated pool that is not yet added to stack_pools. */
static void *new_pool;
/* Number of pools in stack_pools. */
@@ -101,6 +101,34 @@ static int __init disable_stack_depot(char *str)
}
early_param("stack_depot_disable", disable_stack_depot);
+static int __init parse_max_pools(char *str)
+{
+ const long long limit = (1LL << (DEPOT_POOL_INDEX_BITS)) - 1;
+ unsigned int max_pools;
+ int rv;
+
+ rv = kstrtouint(str, 0, &max_pools);
+ if (rv)
+ return rv;
+
+ if (max_pools < 1024) {
+ pr_err("stack_depot_max_pools below 1024, using default of %u\n",
+ stack_max_pools);
+ goto out;
+ }
+
+ if (max_pools > limit) {
+ pr_err("stack_depot_max_pools exceeds %lld, using default of %u\n",
+ limit, stack_max_pools);
+ goto out;
+ }
+
+ stack_max_pools = max_pools;
+out:
+ return 0;
+}
+early_param("stack_depot_max_pools", parse_max_pools);
+
void __init stack_depot_request_early_init(void)
{
/* Too late to request early init now. */
@@ -182,6 +210,17 @@ int __init stack_depot_early_init(void)
}
init_stack_table(entries);
+ pr_info("allocating space for %u stack pools via memblock\n",
+ stack_max_pools);
+ stack_pools =
+ memblock_alloc(stack_max_pools * sizeof(void *), PAGE_SIZE);
+ if (!stack_pools) {
+ pr_err("stack pools allocation failed, disabling\n");
+ memblock_free(stack_table, entries * sizeof(struct list_head));
+ stack_depot_disabled = true;
+ return -ENOMEM;
+ }
+
return 0;
}
@@ -231,6 +270,16 @@ int stack_depot_init(void)
stack_hash_mask = entries - 1;
init_stack_table(entries);
+ pr_info("allocating space for %u stack pools via kvcalloc\n",
+ stack_max_pools);
+ stack_pools = kvcalloc(stack_max_pools, sizeof(void *), GFP_KERNEL);
+ if (!stack_pools) {
+ pr_err("stack pools allocation failed, disabling\n");
+ kvfree(stack_table);
+ stack_depot_disabled = true;
+ ret = -ENOMEM;
+ }
+
out_unlock:
mutex_unlock(&stack_depot_init_mutex);
@@ -245,9 +294,9 @@ static bool depot_init_pool(void **prealloc)
{
lockdep_assert_held(&pool_lock);
- if (unlikely(pools_num >= DEPOT_MAX_POOLS)) {
+ if (unlikely(pools_num >= stack_max_pools)) {
/* Bail out if we reached the pool limit. */
- WARN_ON_ONCE(pools_num > DEPOT_MAX_POOLS); /* should never happen */
+ WARN_ON_ONCE(pools_num > stack_max_pools); /* should never happen */
WARN_ON_ONCE(!new_pool); /* to avoid unnecessary pre-allocation */
WARN_ONCE(1, "Stack depot reached limit capacity");
return false;
@@ -273,7 +322,7 @@ static bool depot_init_pool(void **prealloc)
* NULL; do not reset to NULL if we have reached the maximum number of
* pools.
*/
- if (pools_num < DEPOT_MAX_POOLS)
+ if (pools_num < stack_max_pools)
WRITE_ONCE(new_pool, NULL);
else
WRITE_ONCE(new_pool, STACK_DEPOT_POISON);
diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c
index 6dc234913dd5..5bb752ff7c61 100644
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -126,7 +126,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count)
if (can_do_masked_user_access()) {
long retval;
- src = masked_user_access_begin(src);
+ src = masked_user_read_access_begin(src);
retval = do_strncpy_from_user(dst, src, count, count);
user_read_access_end();
return retval;
diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c
index 6e489f9e90f1..4a6574b67f82 100644
--- a/lib/strnlen_user.c
+++ b/lib/strnlen_user.c
@@ -99,7 +99,7 @@ long strnlen_user(const char __user *str, long count)
if (can_do_masked_user_access()) {
long retval;
- str = masked_user_access_begin(str);
+ str = masked_user_read_access_begin(str);
retval = do_strnlen_user(str, count, count);
user_read_access_end();
return retval;
diff --git a/lib/sys_info.c b/lib/sys_info.c
new file mode 100644
index 000000000000..496f9151c9b6
--- /dev/null
+++ b/lib/sys_info.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/sched/debug.h>
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/ftrace.h>
+#include <linux/sysctl.h>
+#include <linux/nmi.h>
+
+#include <linux/sys_info.h>
+
+struct sys_info_name {
+ unsigned long bit;
+ const char *name;
+};
+
+/*
+ * When 'si_names' gets updated, please make sure the 'sys_info_avail'
+ * below is updated accordingly.
+ */
+static const struct sys_info_name si_names[] = {
+ { SYS_INFO_TASKS, "tasks" },
+ { SYS_INFO_MEM, "mem" },
+ { SYS_INFO_TIMERS, "timers" },
+ { SYS_INFO_LOCKS, "locks" },
+ { SYS_INFO_FTRACE, "ftrace" },
+ { SYS_INFO_ALL_CPU_BT, "all_bt" },
+ { SYS_INFO_BLOCKED_TASKS, "blocked_tasks" },
+};
+
+/* Expecting string like "xxx_sys_info=tasks,mem,timers,locks,ftrace,..." */
+unsigned long sys_info_parse_param(char *str)
+{
+ unsigned long si_bits = 0;
+ char *s, *name;
+ int i;
+
+ s = str;
+ while ((name = strsep(&s, ",")) && *name) {
+ for (i = 0; i < ARRAY_SIZE(si_names); i++) {
+ if (!strcmp(name, si_names[i].name)) {
+ si_bits |= si_names[i].bit;
+ break;
+ }
+ }
+ }
+
+ return si_bits;
+}
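
A hypothetical caller, for illustration (note that strsep() writes NUL bytes into its argument, so the string must be writable):

	char arg[] = "tasks,mem,bogus";
	unsigned long mask = sys_info_parse_param(arg);
	/* mask == (SYS_INFO_TASKS | SYS_INFO_MEM); the unknown "bogus" token is ignored */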
+
+#ifdef CONFIG_SYSCTL
+
+static const char sys_info_avail[] __maybe_unused = "tasks,mem,timers,locks,ftrace,all_bt,blocked_tasks";
+
+int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write,
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ char names[sizeof(sys_info_avail)];
+ struct ctl_table table;
+ unsigned long *si_bits_global;
+
+ si_bits_global = ro_table->data;
+
+ if (write) {
+ unsigned long si_bits;
+ int ret;
+
+ table = *ro_table;
+ table.data = names;
+ table.maxlen = sizeof(names);
+ ret = proc_dostring(&table, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+
+ si_bits = sys_info_parse_param(names);
+ /* The access to the global value is not synchronized. */
+ WRITE_ONCE(*si_bits_global, si_bits);
+ return 0;
+ } else {
+ /* for 'read' operation */
+ char *delim = "";
+ int i, len = 0;
+
+ names[0] = '\0';
+ for (i = 0; i < ARRAY_SIZE(si_names); i++) {
+ if (*si_bits_global & si_names[i].bit) {
+ len += scnprintf(names + len, sizeof(names) - len,
+ "%s%s", delim, si_names[i].name);
+ delim = ",";
+ }
+ }
+
+ table = *ro_table;
+ table.data = names;
+ table.maxlen = sizeof(names);
+ return proc_dostring(&table, write, buffer, lenp, ppos);
+ }
+}
+#endif
+
+void sys_info(unsigned long si_mask)
+{
+ if (si_mask & SYS_INFO_TASKS)
+ show_state();
+
+ if (si_mask & SYS_INFO_MEM)
+ show_mem();
+
+ if (si_mask & SYS_INFO_TIMERS)
+ sysrq_timer_list_show();
+
+ if (si_mask & SYS_INFO_LOCKS)
+ debug_show_all_locks();
+
+ if (si_mask & SYS_INFO_FTRACE)
+ ftrace_dump(DUMP_ALL);
+
+ if (si_mask & SYS_INFO_ALL_CPU_BT)
+ trigger_all_cpu_backtrace();
+
+ if (si_mask & SYS_INFO_BLOCKED_TASKS)
+ show_state_filter(TASK_UNINTERRUPTIBLE);
+}
diff --git a/lib/test_firmware.c b/lib/test_firmware.c
index 211222e63328..be4f93124901 100644
--- a/lib/test_firmware.c
+++ b/lib/test_firmware.c
@@ -26,6 +26,7 @@
#include <linux/kthread.h>
#include <linux/vmalloc.h>
#include <linux/efi_embedded_fw.h>
+#include <linux/string_choices.h>
MODULE_IMPORT_NS("TEST_FIRMWARE");
@@ -304,17 +305,17 @@ static ssize_t config_show(struct device *dev,
"FW_ACTION_NOUEVENT");
len += scnprintf(buf + len, PAGE_SIZE - len,
"into_buf:\t\t%s\n",
- test_fw_config->into_buf ? "true" : "false");
+ str_true_false(test_fw_config->into_buf));
len += scnprintf(buf + len, PAGE_SIZE - len,
"buf_size:\t%zu\n", test_fw_config->buf_size);
len += scnprintf(buf + len, PAGE_SIZE - len,
"file_offset:\t%zu\n", test_fw_config->file_offset);
len += scnprintf(buf + len, PAGE_SIZE - len,
"partial:\t\t%s\n",
- test_fw_config->partial ? "true" : "false");
+ str_true_false(test_fw_config->partial));
len += scnprintf(buf + len, PAGE_SIZE - len,
"sync_direct:\t\t%s\n",
- test_fw_config->sync_direct ? "true" : "false");
+ str_true_false(test_fw_config->sync_direct));
len += scnprintf(buf + len, PAGE_SIZE - len,
"read_fw_idx:\t%u\n", test_fw_config->read_fw_idx);
if (test_fw_config->upload_name)
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 5b144bc5c4ec..83e3d8208a54 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -140,7 +140,7 @@ static int dmirror_bounce_init(struct dmirror_bounce *bounce,
static bool dmirror_is_private_zone(struct dmirror_device *mdevice)
{
return (mdevice->zone_device_type ==
- HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? true : false;
+ HMM_DMIRROR_MEMORY_DEVICE_PRIVATE);
}
static enum migrate_vma_direction
@@ -330,7 +330,7 @@ static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
{
struct mm_struct *mm = dmirror->notifier.mm;
unsigned long addr;
- unsigned long pfns[64];
+ unsigned long pfns[32];
struct hmm_range range = {
.notifier = &dmirror->notifier,
.hmm_pfns = pfns,
@@ -879,8 +879,8 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror,
unsigned long size = cmd->npages << PAGE_SHIFT;
struct mm_struct *mm = dmirror->notifier.mm;
struct vm_area_struct *vma;
- unsigned long src_pfns[64] = { 0 };
- unsigned long dst_pfns[64] = { 0 };
+ unsigned long src_pfns[32] = { 0 };
+ unsigned long dst_pfns[32] = { 0 };
struct migrate_vma args = { 0 };
unsigned long next;
int ret;
@@ -939,8 +939,8 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror,
unsigned long size = cmd->npages << PAGE_SHIFT;
struct mm_struct *mm = dmirror->notifier.mm;
struct vm_area_struct *vma;
- unsigned long src_pfns[64] = { 0 };
- unsigned long dst_pfns[64] = { 0 };
+ unsigned long src_pfns[32] = { 0 };
+ unsigned long dst_pfns[32] = { 0 };
struct dmirror_bounce bounce;
struct migrate_vma args = { 0 };
unsigned long next;
@@ -1144,8 +1144,8 @@ static int dmirror_snapshot(struct dmirror *dmirror,
unsigned long size = cmd->npages << PAGE_SHIFT;
unsigned long addr;
unsigned long next;
- unsigned long pfns[64];
- unsigned char perm[64];
+ unsigned long pfns[32];
+ unsigned char perm[32];
char __user *uptr;
struct hmm_range range = {
.hmm_pfns = pfns,
diff --git a/lib/test_kho.c b/lib/test_kho.c
new file mode 100644
index 000000000000..fff018e5548d
--- /dev/null
+++ b/lib/test_kho.c
@@ -0,0 +1,339 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test module for KHO
+ * Copyright (c) 2025 Microsoft Corporation.
+ *
+ * Authors:
+ * Saurabh Sengar <ssengar@microsoft.com>
+ * Mike Rapoport <rppt@kernel.org>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/libfdt.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/vmalloc.h>
+#include <linux/kexec_handover.h>
+
+#include <net/checksum.h>
+
+#define KHO_TEST_MAGIC 0x4b484f21 /* KHO! */
+#define KHO_TEST_FDT "kho_test"
+#define KHO_TEST_COMPAT "kho-test-v1"
+
+static long max_mem = (PAGE_SIZE << MAX_PAGE_ORDER) * 2;
+module_param(max_mem, long, 0644);
+
+struct kho_test_state {
+ unsigned int nr_folios;
+ struct folio **folios;
+ phys_addr_t *folios_info;
+ struct folio *fdt;
+ __wsum csum;
+};
+
+static struct kho_test_state kho_test_state;
+
+static int kho_test_notifier(struct notifier_block *self, unsigned long cmd,
+ void *v)
+{
+ struct kho_test_state *state = &kho_test_state;
+ struct kho_serialization *ser = v;
+ int err = 0;
+
+ switch (cmd) {
+ case KEXEC_KHO_ABORT:
+ return NOTIFY_DONE;
+ case KEXEC_KHO_FINALIZE:
+ /* Handled below */
+ break;
+ default:
+ return NOTIFY_BAD;
+ }
+
+ err |= kho_preserve_folio(state->fdt);
+ err |= kho_add_subtree(ser, KHO_TEST_FDT, folio_address(state->fdt));
+
+ return err ? NOTIFY_BAD : NOTIFY_DONE;
+}
+
+static struct notifier_block kho_test_nb = {
+ .notifier_call = kho_test_notifier,
+};
+
+static int kho_test_save_data(struct kho_test_state *state, void *fdt)
+{
+ phys_addr_t *folios_info __free(kvfree) = NULL;
+ struct kho_vmalloc folios_info_phys;
+ int err = 0;
+
+ folios_info = vmalloc_array(state->nr_folios, sizeof(*folios_info));
+ if (!folios_info)
+ return -ENOMEM;
+
+ err = kho_preserve_vmalloc(folios_info, &folios_info_phys);
+ if (err)
+ return err;
+
+ for (int i = 0; i < state->nr_folios; i++) {
+ struct folio *folio = state->folios[i];
+ unsigned int order = folio_order(folio);
+
+ folios_info[i] = virt_to_phys(folio_address(folio)) | order;
+
+ err = kho_preserve_folio(folio);
+ if (err)
+ break;
+ }
+
+ err |= fdt_begin_node(fdt, "data");
+ err |= fdt_property(fdt, "nr_folios", &state->nr_folios,
+ sizeof(state->nr_folios));
+ err |= fdt_property(fdt, "folios_info", &folios_info_phys,
+ sizeof(folios_info_phys));
+ err |= fdt_property(fdt, "csum", &state->csum, sizeof(state->csum));
+ err |= fdt_end_node(fdt);
+
+ if (!err)
+ state->folios_info = no_free_ptr(folios_info);
+
+ return err;
+}
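
The folios_info[] encoding above relies on folio addresses being page-aligned, so the low PAGE_SHIFT bits are always zero and free to carry the order; kho_test_restore_data() below unpacks it the same way. In sketch form:

	/* encode: order fits in the always-zero low bits of a page-aligned address */
	folios_info[i] = virt_to_phys(folio_address(folio)) | folio_order(folio);

	/* decode */
	order = folios_info[i] & ~PAGE_MASK;
	phys  = folios_info[i] & PAGE_MASK;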
+
+static int kho_test_prepare_fdt(struct kho_test_state *state)
+{
+ const char compatible[] = KHO_TEST_COMPAT;
+ unsigned int magic = KHO_TEST_MAGIC;
+ ssize_t fdt_size;
+ int err = 0;
+ void *fdt;
+
+ fdt_size = state->nr_folios * sizeof(phys_addr_t) + PAGE_SIZE;
+ state->fdt = folio_alloc(GFP_KERNEL, get_order(fdt_size));
+ if (!state->fdt)
+ return -ENOMEM;
+
+ fdt = folio_address(state->fdt);
+
+ err |= fdt_create(fdt, fdt_size);
+ err |= fdt_finish_reservemap(fdt);
+
+ err |= fdt_begin_node(fdt, "");
+ err |= fdt_property(fdt, "compatible", compatible, sizeof(compatible));
+ err |= fdt_property(fdt, "magic", &magic, sizeof(magic));
+ err |= kho_test_save_data(state, fdt);
+ err |= fdt_end_node(fdt);
+
+ err |= fdt_finish(fdt);
+
+ if (err)
+ folio_put(state->fdt);
+
+ return err;
+}
+
+static int kho_test_generate_data(struct kho_test_state *state)
+{
+ size_t alloc_size = 0;
+ __wsum csum = 0;
+
+ while (alloc_size < max_mem) {
+ int order = get_random_u32() % NR_PAGE_ORDERS;
+ struct folio *folio;
+ unsigned int size;
+ void *addr;
+
+ /*
+ * Since get_order() rounds up, make sure that actual
+ * allocation is smaller so that we won't exceed max_mem
+ */
+ if (alloc_size + (PAGE_SIZE << order) > max_mem) {
+ order = get_order(max_mem - alloc_size);
+ if (order)
+ order--;
+ }
+ size = PAGE_SIZE << order;
+
+ folio = folio_alloc(GFP_KERNEL | __GFP_NORETRY, order);
+ if (!folio)
+ goto err_free_folios;
+
+ state->folios[state->nr_folios++] = folio;
+ addr = folio_address(folio);
+ get_random_bytes(addr, size);
+ csum = csum_partial(addr, size, csum);
+ alloc_size += size;
+ }
+
+ state->csum = csum;
+ return 0;
+
+err_free_folios:
+ for (int i = 0; i < state->nr_folios; i++)
+ folio_put(state->folios[i]);
+ state->nr_folios = 0;
+ return -ENOMEM;
+}
+
+static int kho_test_save(void)
+{
+ struct kho_test_state *state = &kho_test_state;
+ struct folio **folios;
+ unsigned long max_nr;
+ int err;
+
+ max_mem = PAGE_ALIGN(max_mem);
+ max_nr = max_mem >> PAGE_SHIFT;
+
+ folios = kvmalloc_array(max_nr, sizeof(*state->folios), GFP_KERNEL);
+ if (!folios)
+ return -ENOMEM;
+ state->folios = folios;
+
+ err = kho_test_generate_data(state);
+ if (err)
+ goto err_free_folios;
+
+ err = kho_test_prepare_fdt(state);
+ if (err)
+ goto err_free_folios;
+
+ err = register_kho_notifier(&kho_test_nb);
+ if (err)
+ goto err_free_fdt;
+
+ return 0;
+
+err_free_fdt:
+ folio_put(state->fdt);
+err_free_folios:
+ kvfree(folios);
+ return err;
+}
+
+static int kho_test_restore_data(const void *fdt, int node)
+{
+ const struct kho_vmalloc *folios_info_phys;
+ const unsigned int *nr_folios;
+ phys_addr_t *folios_info;
+ const __wsum *old_csum;
+ __wsum csum = 0;
+ int len;
+
+ node = fdt_path_offset(fdt, "/data");
+
+ nr_folios = fdt_getprop(fdt, node, "nr_folios", &len);
+ if (!nr_folios || len != sizeof(*nr_folios))
+ return -EINVAL;
+
+ old_csum = fdt_getprop(fdt, node, "csum", &len);
+ if (!old_csum || len != sizeof(*old_csum))
+ return -EINVAL;
+
+ folios_info_phys = fdt_getprop(fdt, node, "folios_info", &len);
+ if (!folios_info_phys || len != sizeof(*folios_info_phys))
+ return -EINVAL;
+
+ folios_info = kho_restore_vmalloc(folios_info_phys);
+ if (!folios_info)
+ return -EINVAL;
+
+ for (int i = 0; i < *nr_folios; i++) {
+ unsigned int order = folios_info[i] & ~PAGE_MASK;
+ phys_addr_t phys = folios_info[i] & PAGE_MASK;
+ unsigned int size = PAGE_SIZE << order;
+ struct folio *folio;
+
+ folio = kho_restore_folio(phys);
+ if (!folio)
+ break;
+
+ if (folio_order(folio) != order)
+ break;
+
+ csum = csum_partial(folio_address(folio), size, csum);
+ folio_put(folio);
+ }
+
+ vfree(folios_info);
+
+ if (csum != *old_csum)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int kho_test_restore(phys_addr_t fdt_phys)
+{
+ void *fdt = phys_to_virt(fdt_phys);
+ const unsigned int *magic;
+ int node, len, err;
+
+ node = fdt_path_offset(fdt, "/");
+ if (node < 0)
+ return -EINVAL;
+
+ if (fdt_node_check_compatible(fdt, node, KHO_TEST_COMPAT))
+ return -EINVAL;
+
+ magic = fdt_getprop(fdt, node, "magic", &len);
+ if (!magic || len != sizeof(*magic))
+ return -EINVAL;
+
+ if (*magic != KHO_TEST_MAGIC)
+ return -EINVAL;
+
+ err = kho_test_restore_data(fdt, node);
+ if (err)
+ return err;
+
+ pr_info("KHO restore succeeded\n");
+ return 0;
+}
+
+static int __init kho_test_init(void)
+{
+ phys_addr_t fdt_phys;
+ int err;
+
+ if (!kho_is_enabled())
+ return 0;
+
+ err = kho_retrieve_subtree(KHO_TEST_FDT, &fdt_phys);
+ if (!err)
+ return kho_test_restore(fdt_phys);
+
+ if (err != -ENOENT) {
+ pr_warn("failed to retrieve %s FDT: %d\n", KHO_TEST_FDT, err);
+ return err;
+ }
+
+ return kho_test_save();
+}
+module_init(kho_test_init);
+
+static void kho_test_cleanup(void)
+{
+ for (int i = 0; i < kho_test_state.nr_folios; i++)
+ folio_put(kho_test_state.folios[i]);
+
+ kvfree(kho_test_state.folios);
+ vfree(kho_test_state.folios_info);
+ folio_put(kho_test_state.fdt);
+}
+
+static void __exit kho_test_exit(void)
+{
+ unregister_kho_notifier(&kho_test_nb);
+ kho_test_cleanup();
+}
+module_exit(kho_test_exit);
+
+MODULE_AUTHOR("Mike Rapoport <rppt@kernel.org>");
+MODULE_DESCRIPTION("KHO test module");
+MODULE_LICENSE("GPL");
diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c
index 13e2a10d7554..a182e48b5f5e 100644
--- a/lib/test_maple_tree.c
+++ b/lib/test_maple_tree.c
@@ -2746,139 +2746,6 @@ static noinline void __init check_fuzzer(struct maple_tree *mt)
mtree_test_erase(mt, ULONG_MAX - 10);
}
-/* duplicate the tree with a specific gap */
-static noinline void __init check_dup_gaps(struct maple_tree *mt,
- unsigned long nr_entries, bool zero_start,
- unsigned long gap)
-{
- unsigned long i = 0;
- struct maple_tree newmt;
- int ret;
- void *tmp;
- MA_STATE(mas, mt, 0, 0);
- MA_STATE(newmas, &newmt, 0, 0);
- struct rw_semaphore newmt_lock;
-
- init_rwsem(&newmt_lock);
- mt_set_external_lock(&newmt, &newmt_lock);
-
- if (!zero_start)
- i = 1;
-
- mt_zero_nr_tallocated();
- for (; i <= nr_entries; i++)
- mtree_store_range(mt, i*10, (i+1)*10 - gap,
- xa_mk_value(i), GFP_KERNEL);
-
- mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN);
- mt_set_non_kernel(99999);
- down_write(&newmt_lock);
- ret = mas_expected_entries(&newmas, nr_entries);
- mt_set_non_kernel(0);
- MT_BUG_ON(mt, ret != 0);
-
- rcu_read_lock();
- mas_for_each(&mas, tmp, ULONG_MAX) {
- newmas.index = mas.index;
- newmas.last = mas.last;
- mas_store(&newmas, tmp);
- }
- rcu_read_unlock();
- mas_destroy(&newmas);
-
- __mt_destroy(&newmt);
- up_write(&newmt_lock);
-}
-
-/* Duplicate many sizes of trees. Mainly to test expected entry values */
-static noinline void __init check_dup(struct maple_tree *mt)
-{
- int i;
- int big_start = 100010;
-
- /* Check with a value at zero */
- for (i = 10; i < 1000; i++) {
- mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE);
- check_dup_gaps(mt, i, true, 5);
- mtree_destroy(mt);
- rcu_barrier();
- }
-
- cond_resched();
- mt_cache_shrink();
- /* Check with a value at zero, no gap */
- for (i = 1000; i < 2000; i++) {
- mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE);
- check_dup_gaps(mt, i, true, 0);
- mtree_destroy(mt);
- rcu_barrier();
- }
-
- cond_resched();
- mt_cache_shrink();
- /* Check with a value at zero and unreasonably large */
- for (i = big_start; i < big_start + 10; i++) {
- mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE);
- check_dup_gaps(mt, i, true, 5);
- mtree_destroy(mt);
- rcu_barrier();
- }
-
- cond_resched();
- mt_cache_shrink();
- /* Small to medium size not starting at zero*/
- for (i = 200; i < 1000; i++) {
- mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE);
- check_dup_gaps(mt, i, false, 5);
- mtree_destroy(mt);
- rcu_barrier();
- }
-
- cond_resched();
- mt_cache_shrink();
- /* Unreasonably large not starting at zero*/
- for (i = big_start; i < big_start + 10; i++) {
- mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE);
- check_dup_gaps(mt, i, false, 5);
- mtree_destroy(mt);
- rcu_barrier();
- cond_resched();
- mt_cache_shrink();
- }
-
- /* Check non-allocation tree not starting at zero */
- for (i = 1500; i < 3000; i++) {
- mt_init_flags(mt, 0);
- check_dup_gaps(mt, i, false, 5);
- mtree_destroy(mt);
- rcu_barrier();
- cond_resched();
- if (i % 2 == 0)
- mt_cache_shrink();
- }
-
- mt_cache_shrink();
- /* Check non-allocation tree starting at zero */
- for (i = 200; i < 1000; i++) {
- mt_init_flags(mt, 0);
- check_dup_gaps(mt, i, true, 5);
- mtree_destroy(mt);
- rcu_barrier();
- cond_resched();
- }
-
- mt_cache_shrink();
- /* Unreasonably large */
- for (i = big_start + 5; i < big_start + 10; i++) {
- mt_init_flags(mt, 0);
- check_dup_gaps(mt, i, true, 5);
- mtree_destroy(mt);
- rcu_barrier();
- mt_cache_shrink();
- cond_resched();
- }
-}
-
static noinline void __init check_bnode_min_spanning(struct maple_tree *mt)
{
int i = 50;
@@ -3177,6 +3044,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
void *entry, *ptr = (void *) 0x1234500;
void *ptr2 = &ptr;
void *ptr3 = &ptr2;
+ unsigned long index;
/* Check MAS_ROOT First */
mtree_store_range(mt, 0, 0, ptr, GFP_KERNEL);
@@ -3561,7 +3429,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
MT_BUG_ON(mt, mas.last != 0x1500);
MT_BUG_ON(mt, !mas_is_active(&mas));
- /* find: start ->active on value */;
+ /* find: start ->active on value */
mas_set(&mas, 1200);
entry = mas_find(&mas, ULONG_MAX);
MT_BUG_ON(mt, entry != ptr);
@@ -3707,6 +3575,37 @@ static noinline void __init check_state_handling(struct maple_tree *mt)
MT_BUG_ON(mt, !mas_is_active(&mas));
mas_unlock(&mas);
+ mtree_destroy(mt);
+
+ mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE);
+ mas_lock(&mas);
+ for (int count = 0; count < 30; count++) {
+ mas_set(&mas, count);
+ mas_store_gfp(&mas, xa_mk_value(count), GFP_KERNEL);
+ }
+
+ /* Ensure mas_find works with MA_UNDERFLOW */
+ mas_set(&mas, 0);
+ entry = mas_walk(&mas);
+ mas_set(&mas, 0);
+ mas_prev(&mas, 0);
+ MT_BUG_ON(mt, mas.status != ma_underflow);
+ MT_BUG_ON(mt, mas_find(&mas, ULONG_MAX) != entry);
+
+ /* Restore active on mas_next */
+ entry = mas_next(&mas, ULONG_MAX);
+ index = mas.index;
+ mas_prev(&mas, index);
+ MT_BUG_ON(mt, mas.status != ma_underflow);
+ MT_BUG_ON(mt, mas_next(&mas, ULONG_MAX) != entry);
+
+ /* Ensure overflow -> active works */
+ mas_prev(&mas, 0);
+ mas_next(&mas, index - 1);
+ MT_BUG_ON(mt, mas.status != ma_overflow);
+ MT_BUG_ON(mt, mas_next(&mas, ULONG_MAX) != entry);
+
+ mas_unlock(&mas);
}
static noinline void __init alloc_cyclic_testing(struct maple_tree *mt)
@@ -4046,10 +3945,6 @@ static int __init maple_tree_seed(void)
mtree_destroy(&tree);
mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE);
- check_dup(&tree);
- mtree_destroy(&tree);
-
- mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE);
check_bnode_min_spanning(&tree);
mtree_destroy(&tree);
diff --git a/lib/test_objagg.c b/lib/test_objagg.c
index d34df4306b87..ce5c4c36a084 100644
--- a/lib/test_objagg.c
+++ b/lib/test_objagg.c
@@ -899,57 +899,31 @@ static int check_expect_hints_stats(struct objagg_hints *objagg_hints,
int err;
stats = objagg_hints_stats_get(objagg_hints);
- if (IS_ERR(stats))
+ if (IS_ERR(stats)) {
+ *errmsg = "objagg_hints_stats_get() failed.";
return PTR_ERR(stats);
+ }
err = __check_expect_stats(stats, expect_stats, errmsg);
objagg_stats_put(stats);
return err;
}
-static int test_hints_case(const struct hints_case *hints_case)
+static int test_hints_case2(const struct hints_case *hints_case,
+ struct objagg_hints *hints, struct objagg *objagg)
{
struct objagg_obj *objagg_obj;
- struct objagg_hints *hints;
struct world world2 = {};
- struct world world = {};
struct objagg *objagg2;
- struct objagg *objagg;
const char *errmsg;
int i;
int err;
- objagg = objagg_create(&delta_ops, NULL, &world);
- if (IS_ERR(objagg))
- return PTR_ERR(objagg);
-
- for (i = 0; i < hints_case->key_ids_count; i++) {
- objagg_obj = world_obj_get(&world, objagg,
- hints_case->key_ids[i]);
- if (IS_ERR(objagg_obj)) {
- err = PTR_ERR(objagg_obj);
- goto err_world_obj_get;
- }
- }
-
- pr_debug_stats(objagg);
- err = check_expect_stats(objagg, &hints_case->expect_stats, &errmsg);
- if (err) {
- pr_err("Stats: %s\n", errmsg);
- goto err_check_expect_stats;
- }
-
- hints = objagg_hints_get(objagg, OBJAGG_OPT_ALGO_SIMPLE_GREEDY);
- if (IS_ERR(hints)) {
- err = PTR_ERR(hints);
- goto err_hints_get;
- }
-
pr_debug_hints_stats(hints);
err = check_expect_hints_stats(hints, &hints_case->expect_stats_hints,
&errmsg);
if (err) {
pr_err("Hints stats: %s\n", errmsg);
- goto err_check_expect_hints_stats;
+ return err;
}
objagg2 = objagg_create(&delta_ops, hints, &world2);
@@ -981,7 +955,48 @@ err_world2_obj_get:
world_obj_put(&world2, objagg, hints_case->key_ids[i]);
i = hints_case->key_ids_count;
objagg_destroy(objagg2);
-err_check_expect_hints_stats:
+
+ return err;
+}
+
+static int test_hints_case(const struct hints_case *hints_case)
+{
+ struct objagg_obj *objagg_obj;
+ struct objagg_hints *hints;
+ struct world world = {};
+ struct objagg *objagg;
+ const char *errmsg;
+ int i;
+ int err;
+
+ objagg = objagg_create(&delta_ops, NULL, &world);
+ if (IS_ERR(objagg))
+ return PTR_ERR(objagg);
+
+ for (i = 0; i < hints_case->key_ids_count; i++) {
+ objagg_obj = world_obj_get(&world, objagg,
+ hints_case->key_ids[i]);
+ if (IS_ERR(objagg_obj)) {
+ err = PTR_ERR(objagg_obj);
+ goto err_world_obj_get;
+ }
+ }
+
+ pr_debug_stats(objagg);
+ err = check_expect_stats(objagg, &hints_case->expect_stats, &errmsg);
+ if (err) {
+ pr_err("Stats: %s\n", errmsg);
+ goto err_check_expect_stats;
+ }
+
+ hints = objagg_hints_get(objagg, OBJAGG_OPT_ALGO_SIMPLE_GREEDY);
+ if (IS_ERR(hints)) {
+ err = PTR_ERR(hints);
+ goto err_hints_get;
+ }
+
+ err = test_hints_case2(hints_case, hints, objagg);
+
objagg_hints_put(hints);
err_hints_get:
err_check_expect_stats:
diff --git a/lib/test_objpool.c b/lib/test_objpool.c
index 8f688187fa87..6a34a7582fdb 100644
--- a/lib/test_objpool.c
+++ b/lib/test_objpool.c
@@ -164,7 +164,7 @@ static enum hrtimer_restart ot_hrtimer_handler(struct hrtimer *hrt)
/* do bulk-testings for objects pop/push */
item->worker(item, 1);
- hrtimer_forward(hrt, hrt->base->get_time(), item->hrtcycle);
+ hrtimer_forward_now(hrt, item->hrtcycle);
return HRTIMER_RESTART;
}
diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
index 1b0b59549aaf..2815658ccc37 100644
--- a/lib/test_vmalloc.c
+++ b/lib/test_vmalloc.c
@@ -41,7 +41,7 @@ __param(int, nr_pages, 0,
__param(bool, use_huge, false,
"Use vmalloc_huge in fix_size_alloc_test");
-__param(int, run_test_mask, INT_MAX,
+__param(int, run_test_mask, 7,
"Set tests specified in the mask.\n\n"
"\t\tid: 1, name: fix_size_alloc_test\n"
"\t\tid: 2, name: full_fit_alloc_test\n"
@@ -396,25 +396,27 @@ cleanup:
struct test_case_desc {
const char *test_name;
int (*test_func)(void);
+ bool xfail;
};
static struct test_case_desc test_case_array[] = {
- { "fix_size_alloc_test", fix_size_alloc_test },
- { "full_fit_alloc_test", full_fit_alloc_test },
- { "long_busy_list_alloc_test", long_busy_list_alloc_test },
- { "random_size_alloc_test", random_size_alloc_test },
- { "fix_align_alloc_test", fix_align_alloc_test },
- { "random_size_align_alloc_test", random_size_align_alloc_test },
- { "align_shift_alloc_test", align_shift_alloc_test },
- { "pcpu_alloc_test", pcpu_alloc_test },
- { "kvfree_rcu_1_arg_vmalloc_test", kvfree_rcu_1_arg_vmalloc_test },
- { "kvfree_rcu_2_arg_vmalloc_test", kvfree_rcu_2_arg_vmalloc_test },
- { "vm_map_ram_test", vm_map_ram_test },
+ { "fix_size_alloc_test", fix_size_alloc_test, },
+ { "full_fit_alloc_test", full_fit_alloc_test, },
+ { "long_busy_list_alloc_test", long_busy_list_alloc_test, },
+ { "random_size_alloc_test", random_size_alloc_test, },
+ { "fix_align_alloc_test", fix_align_alloc_test, },
+ { "random_size_align_alloc_test", random_size_align_alloc_test, },
+ { "align_shift_alloc_test", align_shift_alloc_test, true },
+ { "pcpu_alloc_test", pcpu_alloc_test, },
+ { "kvfree_rcu_1_arg_vmalloc_test", kvfree_rcu_1_arg_vmalloc_test, },
+ { "kvfree_rcu_2_arg_vmalloc_test", kvfree_rcu_2_arg_vmalloc_test, },
+ { "vm_map_ram_test", vm_map_ram_test, },
/* Add a new test case here. */
};
struct test_case_data {
int test_failed;
+ int test_xfailed;
int test_passed;
u64 time;
};
@@ -444,7 +446,7 @@ static int test_func(void *private)
{
struct test_driver *t = private;
int random_array[ARRAY_SIZE(test_case_array)];
- int index, i, j;
+ int index, i, j, ret;
ktime_t kt;
u64 delta;
@@ -468,11 +470,14 @@ static int test_func(void *private)
*/
if (!((run_test_mask & (1 << index)) >> index))
continue;
-
kt = ktime_get();
for (j = 0; j < test_repeat_count; j++) {
- if (!test_case_array[index].test_func())
+ ret = test_case_array[index].test_func();
+
+ if (!ret && !test_case_array[index].xfail)
t->data[index].test_passed++;
+ else if (ret && test_case_array[index].xfail)
+ t->data[index].test_xfailed++;
else
t->data[index].test_failed++;
}
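
The resulting accounting, as a truth table:

	ret == 0 (test passed), xfail == false  ->  test_passed++
	ret != 0 (test failed), xfail == true   ->  test_xfailed++
	any other combination                   ->  test_failed++

Note the last row includes an unexpected pass of a test marked xfail, which is counted as a failure.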
@@ -576,10 +581,11 @@ static void do_concurrent_test(void)
continue;
pr_info(
- "Summary: %s passed: %d failed: %d repeat: %d loops: %d avg: %llu usec\n",
+ "Summary: %s passed: %d failed: %d xfailed: %d repeat: %d loops: %d avg: %llu usec\n",
test_case_array[j].test_name,
t->data[j].test_passed,
t->data[j].test_failed,
+ t->data[j].test_xfailed,
test_repeat_count, test_loop_count,
t->data[j].time);
}
@@ -598,7 +604,11 @@ static int __init vmalloc_test_init(void)
return IS_BUILTIN(CONFIG_TEST_VMALLOC) ? 0:-EAGAIN;
}
+#ifdef MODULE
module_init(vmalloc_test_init)
+#else
+late_initcall(vmalloc_test_init);
+#endif
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Uladzislau Rezki");
diff --git a/lib/tests/Makefile b/lib/tests/Makefile
index 56d645014482..f7460831cfdd 100644
--- a/lib/tests/Makefile
+++ b/lib/tests/Makefile
@@ -10,7 +10,7 @@ obj-$(CONFIG_BLACKHOLE_DEV_KUNIT_TEST) += blackhole_dev_kunit.o
obj-$(CONFIG_CHECKSUM_KUNIT) += checksum_kunit.o
obj-$(CONFIG_CMDLINE_KUNIT_TEST) += cmdline_kunit.o
obj-$(CONFIG_CPUMASK_KUNIT_TEST) += cpumask_kunit.o
-obj-$(CONFIG_CRC_KUNIT_TEST) += crc_kunit.o
+obj-$(CONFIG_FFS_KUNIT_TEST) += ffs_kunit.o
CFLAGS_fortify_kunit.o += $(call cc-disable-warning, unsequenced)
CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-overread)
CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-truncation)
@@ -37,6 +37,7 @@ obj-$(CONFIG_OVERFLOW_KUNIT_TEST) += overflow_kunit.o
obj-$(CONFIG_PRINTF_KUNIT_TEST) += printf_kunit.o
obj-$(CONFIG_RANDSTRUCT_KUNIT_TEST) += randstruct_kunit.o
obj-$(CONFIG_SCANF_KUNIT_TEST) += scanf_kunit.o
+obj-$(CONFIG_SEQ_BUF_KUNIT_TEST) += seq_buf_kunit.o
obj-$(CONFIG_SIPHASH_KUNIT_TEST) += siphash_kunit.o
obj-$(CONFIG_SLUB_KUNIT_TEST) += slub_kunit.o
obj-$(CONFIG_TEST_SORT) += test_sort.o
@@ -46,5 +47,6 @@ obj-$(CONFIG_STRING_KUNIT_TEST) += string_kunit.o
obj-$(CONFIG_STRING_HELPERS_KUNIT_TEST) += string_helpers_kunit.o
obj-$(CONFIG_USERCOPY_KUNIT_TEST) += usercopy_kunit.o
obj-$(CONFIG_UTIL_MACROS_KUNIT) += util_macros_kunit.o
+obj-$(CONFIG_RATELIMIT_KUNIT_TEST) += test_ratelimit.o
obj-$(CONFIG_TEST_RUNTIME_MODULE) += module/
diff --git a/lib/tests/ffs_kunit.c b/lib/tests/ffs_kunit.c
new file mode 100644
index 000000000000..9a329cdc09c2
--- /dev/null
+++ b/lib/tests/ffs_kunit.c
@@ -0,0 +1,566 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KUnit tests for ffs()-family functions
+ */
+#include <kunit/test.h>
+#include <linux/bitops.h>
+
+/*
+ * Test data structures
+ */
+struct ffs_test_case {
+ unsigned long input;
+ int expected_ffs; /* ffs() result (1-based) */
+ int expected_fls; /* fls() result (1-based) */
+ const char *description;
+};
+
+struct ffs64_test_case {
+ u64 input;
+ int expected_fls64; /* fls64() result (1-based) */
+ unsigned int expected_ffs64_0based; /* __ffs64() result (0-based) */
+ const char *description;
+};
+
+/*
+ * Basic edge cases - core functionality validation
+ */
+static const struct ffs_test_case basic_test_cases[] = {
+ /* Zero case - special handling */
+ {0x00000000, 0, 0, "zero value"},
+
+ /* Single bit patterns - powers of 2 */
+ {0x00000001, 1, 1, "bit 0 set"},
+ {0x00000002, 2, 2, "bit 1 set"},
+ {0x00000004, 3, 3, "bit 2 set"},
+ {0x00000008, 4, 4, "bit 3 set"},
+ {0x00000010, 5, 5, "bit 4 set"},
+ {0x00000020, 6, 6, "bit 5 set"},
+ {0x00000040, 7, 7, "bit 6 set"},
+ {0x00000080, 8, 8, "bit 7 set"},
+ {0x00000100, 9, 9, "bit 8 set"},
+ {0x00008000, 16, 16, "bit 15 set"},
+ {0x00010000, 17, 17, "bit 16 set"},
+ {0x40000000, 31, 31, "bit 30 set"},
+ {0x80000000, 32, 32, "bit 31 set (sign bit)"},
+
+ /* Maximum values */
+ {0xFFFFFFFF, 1, 32, "all bits set"},
+
+ /* Multiple bit patterns */
+ {0x00000003, 1, 2, "bits 0-1 set"},
+ {0x00000007, 1, 3, "bits 0-2 set"},
+ {0x0000000F, 1, 4, "bits 0-3 set"},
+ {0x000000FF, 1, 8, "bits 0-7 set"},
+ {0x0000FFFF, 1, 16, "bits 0-15 set"},
+ {0x7FFFFFFF, 1, 31, "bits 0-30 set"},
+
+ /* Sparse patterns */
+ {0x00000101, 1, 9, "bits 0,8 set"},
+ {0x00001001, 1, 13, "bits 0,12 set"},
+ {0x80000001, 1, 32, "bits 0,31 set"},
+ {0x40000002, 2, 31, "bits 1,30 set"},
+};
+
+/*
+ * 64-bit test cases
+ */
+static const struct ffs64_test_case ffs64_test_cases[] = {
+ /* Zero case */
+ {0x0000000000000000ULL, 0, 0, "zero value"},
+
+ /* Single bit patterns */
+ {0x0000000000000001ULL, 1, 0, "bit 0 set"},
+ {0x0000000000000002ULL, 2, 1, "bit 1 set"},
+ {0x0000000000000004ULL, 3, 2, "bit 2 set"},
+ {0x0000000000000008ULL, 4, 3, "bit 3 set"},
+ {0x0000000000008000ULL, 16, 15, "bit 15 set"},
+ {0x0000000000010000ULL, 17, 16, "bit 16 set"},
+ {0x0000000080000000ULL, 32, 31, "bit 31 set"},
+ {0x0000000100000000ULL, 33, 32, "bit 32 set"},
+ {0x0000000200000000ULL, 34, 33, "bit 33 set"},
+ {0x4000000000000000ULL, 63, 62, "bit 62 set"},
+ {0x8000000000000000ULL, 64, 63, "bit 63 set (sign bit)"},
+
+ /* Maximum values */
+ {0xFFFFFFFFFFFFFFFFULL, 64, 0, "all bits set"},
+
+ /* Cross 32-bit boundary patterns */
+ {0x00000000FFFFFFFFULL, 32, 0, "lower 32 bits set"},
+ {0xFFFFFFFF00000000ULL, 64, 32, "upper 32 bits set"},
+ {0x8000000000000001ULL, 64, 0, "bits 0,63 set"},
+ {0x4000000000000002ULL, 63, 1, "bits 1,62 set"},
+
+ /* Mixed patterns */
+ {0x00000001FFFFFFFFULL, 33, 0, "bit 32 + lower 32 bits"},
+ {0xFFFFFFFF80000000ULL, 64, 31, "upper 32 bits + bit 31"},
+};
+
+/*
+ * Helper function to validate ffs results with detailed error messages
+ */
+static void validate_ffs_result(struct kunit *test, unsigned long input,
+ int actual, int expected, const char *func_name,
+ const char *description)
+{
+ KUNIT_EXPECT_EQ_MSG(test, actual, expected,
+ "%s(0x%08lx) [%s]: expected %d, got %d",
+ func_name, input, description, expected, actual);
+}
+
+/*
+ * Helper function to validate 64-bit ffs results
+ */
+static void validate_ffs64_result(struct kunit *test, u64 input,
+ int actual, int expected, const char *func_name,
+ const char *description)
+{
+ KUNIT_EXPECT_EQ_MSG(test, actual, expected,
+ "%s(0x%016llx) [%s]: expected %d, got %d",
+ func_name, input, description, expected, actual);
+}
+
+/*
+ * Helper function to validate mathematical relationships between functions
+ */
+static void validate_ffs_relationships(struct kunit *test, unsigned long input)
+{
+ int ffs_result;
+ int fls_result;
+ unsigned int ffs_0based;
+ unsigned int fls_0based;
+
+ if (input == 0) {
+ /* Special case: zero input */
+ KUNIT_EXPECT_EQ(test, ffs(input), 0);
+ KUNIT_EXPECT_EQ(test, fls(input), 0);
+ /* __ffs and __fls are undefined for 0, but often return specific values */
+ return;
+ }
+
+ ffs_result = ffs(input);
+ fls_result = fls(input);
+ ffs_0based = __ffs(input);
+ fls_0based = __fls(input);
+
+ /* Relationship: ffs(x) == __ffs(x) + 1 for x != 0 */
+ KUNIT_EXPECT_EQ_MSG(test, ffs_result, ffs_0based + 1,
+ "ffs(0x%08lx) != __ffs(0x%08lx) + 1: %d != %u + 1",
+ input, input, ffs_result, ffs_0based);
+
+ /* Relationship: fls(x) == __fls(x) + 1 for x != 0 */
+ KUNIT_EXPECT_EQ_MSG(test, fls_result, fls_0based + 1,
+ "fls(0x%08lx) != __fls(0x%08lx) + 1: %d != %u + 1",
+ input, input, fls_result, fls_0based);
+
+ /* Range validation */
+ KUNIT_EXPECT_GE(test, ffs_result, 1);
+ KUNIT_EXPECT_LE(test, ffs_result, BITS_PER_LONG);
+ KUNIT_EXPECT_GE(test, fls_result, 1);
+ KUNIT_EXPECT_LE(test, fls_result, BITS_PER_LONG);
+}
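
As a concrete instance of the relationships checked above, take x = 0x90 (binary 1001 0000):

	ffs(0x90)   == 5	/* lowest set bit is bit 4, 1-based */
	__ffs(0x90) == 4	/* same bit, 0-based */
	fls(0x90)   == 8	/* highest set bit is bit 7, 1-based */
	__fls(0x90) == 7	/* same bit, 0-based */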
+
+/*
+ * Helper function to validate 64-bit relationships
+ */
+static void validate_ffs64_relationships(struct kunit *test, u64 input)
+{
+ int fls64_result;
+ unsigned int ffs64_0based;
+
+ if (input == 0) {
+ KUNIT_EXPECT_EQ(test, fls64(input), 0);
+ return;
+ }
+
+ fls64_result = fls64(input);
+ ffs64_0based = __ffs64(input);
+
+ /* Range validation */
+ KUNIT_EXPECT_GE(test, fls64_result, 1);
+ KUNIT_EXPECT_LE(test, fls64_result, 64);
+ KUNIT_EXPECT_LT(test, ffs64_0based, 64);
+
+ /*
+ * Relationships with 32-bit functions should hold for small values
+ * on all architectures.
+ */
+ if (input <= 0xFFFFFFFFULL) {
+ unsigned long input_32 = (unsigned long)input;
+ KUNIT_EXPECT_EQ_MSG(test, fls64(input), fls(input_32),
+ "fls64(0x%llx) != fls(0x%lx): %d != %d",
+ input, input_32, fls64(input), fls(input_32));
+
+ if (input != 0) {
+ KUNIT_EXPECT_EQ_MSG(test, __ffs64(input), __ffs(input_32),
+ "__ffs64(0x%llx) != __ffs(0x%lx): %lu != %lu",
+ input, input_32,
+ (unsigned long)__ffs64(input),
+ (unsigned long)__ffs(input_32));
+ }
+ }
+}
+
+/*
+ * Test basic correctness of all ffs-family functions
+ */
+static void ffs_basic_correctness_test(struct kunit *test)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(basic_test_cases); i++) {
+ const struct ffs_test_case *tc = &basic_test_cases[i];
+
+ /* Test ffs() */
+ validate_ffs_result(test, tc->input, ffs(tc->input),
+ tc->expected_ffs, "ffs", tc->description);
+
+ /* Test fls() */
+ validate_ffs_result(test, tc->input, fls(tc->input),
+ tc->expected_fls, "fls", tc->description);
+
+ /* Test __ffs() - skip zero case as it's undefined */
+ if (tc->input != 0) {
+ /* Calculate expected __ffs() result: __ffs(x) == ffs(x) - 1 */
+ unsigned int expected_ffs_0based = tc->expected_ffs - 1;
+ validate_ffs_result(test, tc->input, __ffs(tc->input),
+ expected_ffs_0based, "__ffs", tc->description);
+ }
+
+ /* Test __fls() - skip zero case as it's undefined */
+ if (tc->input != 0) {
+ /* Calculate expected __fls() result: __fls(x) == fls(x) - 1 */
+ unsigned int expected_fls_0based = tc->expected_fls - 1;
+ validate_ffs_result(test, tc->input, __fls(tc->input),
+ expected_fls_0based, "__fls", tc->description);
+ }
+ }
+}
+
+/*
+ * Test 64-bit function correctness
+ */
+static void ffs64_correctness_test(struct kunit *test)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ffs64_test_cases); i++) {
+ const struct ffs64_test_case *tc = &ffs64_test_cases[i];
+
+ /* Test fls64() */
+ validate_ffs64_result(test, tc->input, fls64(tc->input),
+ tc->expected_fls64, "fls64", tc->description);
+
+ /* Test __ffs64() - skip zero case as it's undefined */
+ if (tc->input != 0) {
+ validate_ffs64_result(test, tc->input, __ffs64(tc->input),
+ tc->expected_ffs64_0based, "__ffs64",
+ tc->description);
+ }
+ }
+}
+
+/*
+ * Test mathematical relationships between functions
+ */
+static void ffs_mathematical_relationships_test(struct kunit *test)
+{
+ int i;
+
+ /* Test basic cases */
+ for (i = 0; i < ARRAY_SIZE(basic_test_cases); i++) {
+ validate_ffs_relationships(test, basic_test_cases[i].input);
+ }
+
+ /* Test 64-bit cases */
+ for (i = 0; i < ARRAY_SIZE(ffs64_test_cases); i++) {
+ validate_ffs64_relationships(test, ffs64_test_cases[i].input);
+ }
+}
+
+/*
+ * Test edge cases and boundary conditions
+ */
+static void ffs_edge_cases_test(struct kunit *test)
+{
+ unsigned long test_patterns[] = {
+ /* Powers of 2 */
+ 1UL, 2UL, 4UL, 8UL, 16UL, 32UL, 64UL, 128UL,
+ 256UL, 512UL, 1024UL, 2048UL, 4096UL, 8192UL,
+
+ /* Powers of 2 minus 1 */
+ 1UL, 3UL, 7UL, 15UL, 31UL, 63UL, 127UL, 255UL,
+ 511UL, 1023UL, 2047UL, 4095UL, 8191UL,
+
+ /* Boundary values */
+ 0x7FFFFFFFUL, /* Maximum positive 32-bit value */
+ 0x80000000UL, /* Sign bit only (INT_MIN bit pattern) */
+ 0xFFFFFFFFUL, /* Maximum 32-bit unsigned */
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(test_patterns); i++) {
+ validate_ffs_relationships(test, test_patterns[i]);
+ }
+}
+
+/*
+ * Test 64-bit edge cases
+ */
+static void ffs64_edge_cases_test(struct kunit *test)
+{
+ u64 test_patterns_64[] = {
+ /* 64-bit powers of 2 */
+ 0x0000000100000000ULL, /* 2^32 */
+ 0x0000000200000000ULL, /* 2^33 */
+ 0x0000000400000000ULL, /* 2^34 */
+ 0x0000001000000000ULL, /* 2^36 */
+ 0x0000010000000000ULL, /* 2^40 */
+ 0x0001000000000000ULL, /* 2^48 */
+ 0x0100000000000000ULL, /* 2^56 */
+ 0x4000000000000000ULL, /* 2^62 */
+ 0x8000000000000000ULL, /* 2^63 */
+
+ /* Cross-boundary patterns */
+ 0x00000000FFFFFFFFULL, /* Lower 32 bits */
+ 0xFFFFFFFF00000000ULL, /* Upper 32 bits */
+ 0x7FFFFFFFFFFFFFFFULL, /* Maximum positive 64-bit */
+ 0xFFFFFFFFFFFFFFFFULL, /* Maximum 64-bit unsigned */
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(test_patterns_64); i++) {
+ validate_ffs64_relationships(test, test_patterns_64[i]);
+ }
+}
+
+/*
+ * ffz() test data - Find First Zero bit test cases
+ */
+struct ffz_test_case {
+ unsigned long input;
+ unsigned long expected_ffz;
+ const char *description;
+};
+
+static const struct ffz_test_case ffz_test_cases[] = {
+ /* Zero bits in specific positions */
+ {0xFFFFFFFE, 0, "bit 0 is zero"}, /* ...11111110 */
+ {0xFFFFFFFD, 1, "bit 1 is zero"}, /* ...11111101 */
+ {0xFFFFFFFB, 2, "bit 2 is zero"}, /* ...11111011 */
+ {0xFFFFFFF7, 3, "bit 3 is zero"}, /* ...11110111 */
+ {0xFFFFFFEF, 4, "bit 4 is zero"}, /* ...11101111 */
+ {0xFFFFFFDF, 5, "bit 5 is zero"}, /* ...11011111 */
+ {0xFFFFFFBF, 6, "bit 6 is zero"}, /* ...10111111 */
+ {0xFFFFFF7F, 7, "bit 7 is zero"}, /* ...01111111 */
+ {0xFFFFFEFF, 8, "bit 8 is zero"}, /* Gap in bit 8 */
+ {0xFFFF7FFF, 15, "bit 15 is zero"}, /* Gap in bit 15 */
+ {0xFFFEFFFF, 16, "bit 16 is zero"}, /* Gap in bit 16 */
+ {0xBFFFFFFF, 30, "bit 30 is zero"}, /* Gap in bit 30 */
+ {0x7FFFFFFF, 31, "bit 31 is zero"}, /* 01111111... */
+
+ /* Multiple zero patterns */
+ {0xFFFFFFFC, 0, "bits 0-1 are zero"}, /* ...11111100 */
+ {0xFFFFFFF8, 0, "bits 0-2 are zero"}, /* ...11111000 */
+ {0xFFFFFFF0, 0, "bits 0-3 are zero"}, /* ...11110000 */
+ {0xFFFFFF00, 0, "bits 0-7 are zero"}, /* ...00000000 */
+ {0xFFFF0000, 0, "bits 0-15 are zero"}, /* Lower 16 bits zero */
+
+ /* All zeros (special case) */
+ {0x00000000, 0, "all bits zero"},
+
+ /* Complex patterns */
+ {0xFFFDFFFF, 17, "bit 17 is zero"}, /* Gap in bit 17 */
+ {0xFFF7FFFF, 19, "bit 19 is zero"}, /* Gap in bit 19 */
+ {0xF7FFFFFF, 27, "bit 27 is zero"}, /* Gap in bit 27 */
+ {0xDFFFFFFF, 29, "bit 29 is zero"}, /* Gap in bit 29 */
+};
+
+/*
+ * Test basic correctness of ffz() function
+ */
+static void ffz_basic_correctness_test(struct kunit *test)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ffz_test_cases); i++) {
+ const struct ffz_test_case *tc = &ffz_test_cases[i];
+ unsigned long result = ffz(tc->input);
+
+ KUNIT_EXPECT_EQ_MSG(test, result, tc->expected_ffz,
+ "ffz(0x%08lx) [%s]: expected %lu, got %lu",
+ tc->input, tc->description, tc->expected_ffz, result);
+ }
+}
+
+/*
+ * Test mathematical relationships between ffz() and other functions
+ */
+static void validate_ffz_relationships(struct kunit *test, unsigned long input)
+{
+ unsigned long ffz_result;
+
+ if (input == 0) {
+ /* ffz(0) should return 0 (first zero bit is at position 0) */
+ KUNIT_EXPECT_EQ(test, ffz(input), 0);
+ return;
+ }
+
+ if (input == ~0UL) {
+ /* ffz(~0) is undefined (no zero bits) - just verify it doesn't crash */
+ ffz_result = ffz(input);
+ /* Implementation-defined behavior, just ensure it completes */
+ return;
+ }
+
+ ffz_result = ffz(input);
+
+ /* Range validation - result should be within valid bit range */
+ KUNIT_EXPECT_LT(test, ffz_result, BITS_PER_LONG);
+
+ /* Verify the bit at ffz_result position is actually zero */
+ KUNIT_EXPECT_EQ_MSG(test, (input >> ffz_result) & 1, 0,
+ "ffz(0x%08lx) = %lu, but bit %lu is not zero",
+ input, ffz_result, ffz_result);
+
+ /* Core relationship: if we set the ffz bit, ffz should find a different bit */
+ if (ffz_result < BITS_PER_LONG - 1) {
+ unsigned long modified = input | (1UL << ffz_result);
+ if (modified != ~0UL) { /* Skip if all bits would be set */
+ unsigned long new_ffz = ffz(modified);
+ KUNIT_EXPECT_NE_MSG(test, new_ffz, ffz_result,
+ "ffz(0x%08lx) = %lu, but setting that bit doesn't change ffz result",
+ input, ffz_result);
+ }
+ }
+}
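The bit-level checks above amount to the identity ffz(x) == __ffs(~x) for any x with at least one zero bit. A minimal userspace sketch of that reference definition, using a compiler builtin in place of __ffs():

#include <assert.h>

static unsigned long ffz_ref(unsigned long x)
{
	/* The first zero bit of x is the first set bit of ~x. */
	return __builtin_ctzl(~x);
}

int main(void)
{
	assert(ffz_ref(0xFFFFFFFDUL) == 1);	/* bit 1 is the lowest zero */
	assert(ffz_ref(0xFFFFFF00UL) == 0);	/* bits 0-7 are zero */
	return 0;
}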
+
+static void ffz_mathematical_relationships_test(struct kunit *test)
+{
+ unsigned long test_patterns[] = {
+ /* Powers of 2 with one bit clear */
+ 0xFFFFFFFE, 0xFFFFFFFD, 0xFFFFFFFB, 0xFFFFFFF7,
+ 0xFFFFFFEF, 0xFFFFFFDF, 0xFFFFFFBF, 0xFFFFFF7F,
+
+ /* Multiple patterns */
+ 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0xFFF00000,
+ 0x7FFFFFFF, 0x3FFFFFFF, 0x1FFFFFFF, 0x0FFFFFFF,
+
+ /* Complex bit patterns */
+ 0xAAAAAAAA, 0x55555555, 0xCCCCCCCC, 0x33333333,
+ 0xF0F0F0F0, 0x0F0F0F0F, 0xFF00FF00, 0x00FF00FF,
+ };
+ int i;
+
+ /* Test basic test cases */
+ for (i = 0; i < ARRAY_SIZE(ffz_test_cases); i++) {
+ validate_ffz_relationships(test, ffz_test_cases[i].input);
+ }
+
+ /* Test additional patterns */
+ for (i = 0; i < ARRAY_SIZE(test_patterns); i++) {
+ validate_ffz_relationships(test, test_patterns[i]);
+ }
+}
+
+/*
+ * Test edge cases and boundary conditions for ffz()
+ */
+static void ffz_edge_cases_test(struct kunit *test)
+{
+ unsigned long edge_patterns[] = {
+ /* Boundary values */
+ 0x00000000, /* All zeros */
+ 0x80000000, /* Only MSB set */
+ 0x00000001, /* Only LSB set */
+ 0x7FFFFFFF, /* MSB clear */
+ 0xFFFFFFFE, /* LSB clear */
+
+ /* Powers of 2 complement patterns (one zero bit each) */
+ ~(1UL << 0), ~(1UL << 1), ~(1UL << 2), ~(1UL << 3),
+ ~(1UL << 4), ~(1UL << 8), ~(1UL << 16), ~(1UL << 31),
+
+ /* Walking zero patterns */
+ 0xFFFFFFFE, 0xFFFFFFFD, 0xFFFFFFFB, 0xFFFFFFF7,
+ 0xFFFFFFEF, 0xFFFFFFDF, 0xFFFFFFBF, 0xFFFFFF7F,
+ 0xFFFFFEFF, 0xFFFFFDFF, 0xFFFFFBFF, 0xFFFFF7FF,
+
+ /* Multiple zeros */
+ 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0xFFF00000,
+ 0xFF000000, 0xF0000000, 0x00000000,
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(edge_patterns); i++) {
+ validate_ffz_relationships(test, edge_patterns[i]);
+ }
+}
+
+/*
+ * To have useful build error output, split the tests into separate
+ * functions so it's clear which are missing __attribute_const__.
+ */
+#define CREATE_WRAPPER(func) \
+static noinline bool build_test_##func(void) \
+{ \
+ int init_##func = 32; \
+ int result_##func = func(6); \
+ \
+ /* Does the static initializer vanish after calling func? */ \
+ BUILD_BUG_ON(init_##func < 32); \
+ \
+ /* "Consume" the results so optimizer doesn't drop them. */ \
+ barrier_data(&init_##func); \
+ barrier_data(&result_##func); \
+ \
+ return true; \
+}
+CREATE_WRAPPER(ffs)
+CREATE_WRAPPER(fls)
+CREATE_WRAPPER(__ffs)
+CREATE_WRAPPER(__fls)
+CREATE_WRAPPER(ffz)
+#undef CREATE_WRAPPER
+
+/*
+ * Make sure that __attribute_const__ has been applied to all the
+ * functions. This is a regression test for:
+ * https://github.com/KSPP/linux/issues/364
+ */
+static void ffs_attribute_const_test(struct kunit *test)
+{
+ KUNIT_EXPECT_TRUE(test, build_test_ffs());
+ KUNIT_EXPECT_TRUE(test, build_test_fls());
+ KUNIT_EXPECT_TRUE(test, build_test___ffs());
+ KUNIT_EXPECT_TRUE(test, build_test___fls());
+ KUNIT_EXPECT_TRUE(test, build_test_ffz());
+}
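What the wrappers verify can be seen in a standalone sketch (ordinary GCC attribute syntax here; the kernel spells it __attribute_const__): a const function's result depends only on its arguments, so the compiler may fold repeated calls and keep surrounding state provably constant for BUILD_BUG_ON().

__attribute__((const)) static int folded(int x)
{
	return x * 2;	/* reads no global state; result depends only on x */
}

int demo(void)
{
	/* Because folded() is const, both calls may be merged into one. */
	return folded(6) + folded(6);
}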
+
+/*
+ * KUnit test case definitions
+ */
+static struct kunit_case ffs_test_cases[] = {
+ KUNIT_CASE(ffs_basic_correctness_test),
+ KUNIT_CASE(ffs64_correctness_test),
+ KUNIT_CASE(ffs_mathematical_relationships_test),
+ KUNIT_CASE(ffs_edge_cases_test),
+ KUNIT_CASE(ffs64_edge_cases_test),
+ KUNIT_CASE(ffz_basic_correctness_test),
+ KUNIT_CASE(ffz_mathematical_relationships_test),
+ KUNIT_CASE(ffz_edge_cases_test),
+ KUNIT_CASE(ffs_attribute_const_test),
+ {}
+};
+
+/*
+ * KUnit test suite definition
+ */
+static struct kunit_suite ffs_test_suite = {
+ .name = "ffs",
+ .test_cases = ffs_test_cases,
+};
+
+kunit_test_suites(&ffs_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests for ffs()-family functions");
+MODULE_LICENSE("GPL");
diff --git a/lib/tests/fortify_kunit.c b/lib/tests/fortify_kunit.c
index 29ffc62a71e3..fc9c76f026d6 100644
--- a/lib/tests/fortify_kunit.c
+++ b/lib/tests/fortify_kunit.c
@@ -1003,8 +1003,8 @@ static void fortify_test_memcmp(struct kunit *test)
{
char one[] = "My mind is going ...";
char two[] = "My mind is going ... I can feel it.";
- size_t one_len = sizeof(one) - 1;
- size_t two_len = sizeof(two) - 1;
+ volatile size_t one_len = sizeof(one) - 1;
+ volatile size_t two_len = sizeof(two) - 1;
OPTIMIZER_HIDE_VAR(one_len);
OPTIMIZER_HIDE_VAR(two_len);
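The volatile qualifier keeps the fortified memcmp() from being constant-folded when the compiler can otherwise prove the lengths at compile time. It backs up the laundering idiom already in place; as a sketch, the kernel's OPTIMIZER_HIDE_VAR() is essentially an empty asm statement with an in/out register constraint:

#define HIDE_VAR(var) __asm__ volatile("" : "+r" (var))
/* After HIDE_VAR(len), the compiler must treat len as an unknown value. */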
diff --git a/lib/tests/longest_symbol_kunit.c b/lib/tests/longest_symbol_kunit.c
index e3c28ff1807f..9b4de3050ba7 100644
--- a/lib/tests/longest_symbol_kunit.c
+++ b/lib/tests/longest_symbol_kunit.c
@@ -3,8 +3,7 @@
* Test the longest symbol length. Execute with:
* ./tools/testing/kunit/kunit.py run longest-symbol
* --arch=x86_64 --kconfig_add CONFIG_KPROBES=y --kconfig_add CONFIG_MODULES=y
- * --kconfig_add CONFIG_RETPOLINE=n --kconfig_add CONFIG_CFI_CLANG=n
- * --kconfig_add CONFIG_MITIGATION_RETPOLINE=n
+ * --kconfig_add CONFIG_CPU_MITIGATIONS=n --kconfig_add CONFIG_GCOV_KERNEL=n
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/lib/tests/seq_buf_kunit.c b/lib/tests/seq_buf_kunit.c
new file mode 100644
index 000000000000..8a01579a978e
--- /dev/null
+++ b/lib/tests/seq_buf_kunit.c
@@ -0,0 +1,208 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit tests for the seq_buf API
+ *
+ * Copyright (C) 2025, Google LLC.
+ */
+
+#include <kunit/test.h>
+#include <linux/seq_buf.h>
+
+static void seq_buf_init_test(struct kunit *test)
+{
+ char buf[32];
+ struct seq_buf s;
+
+ seq_buf_init(&s, buf, sizeof(buf));
+
+ KUNIT_EXPECT_EQ(test, s.size, 32);
+ KUNIT_EXPECT_EQ(test, s.len, 0);
+ KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s));
+ KUNIT_EXPECT_EQ(test, seq_buf_buffer_left(&s), 32);
+ KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 0);
+ KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "");
+}
+
+static void seq_buf_declare_test(struct kunit *test)
+{
+ DECLARE_SEQ_BUF(s, 24);
+
+ KUNIT_EXPECT_EQ(test, s.size, 24);
+ KUNIT_EXPECT_EQ(test, s.len, 0);
+ KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s));
+ KUNIT_EXPECT_EQ(test, seq_buf_buffer_left(&s), 24);
+ KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 0);
+ KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "");
+}
+
+static void seq_buf_clear_test(struct kunit *test)
+{
+ DECLARE_SEQ_BUF(s, 128);
+
+ seq_buf_puts(&s, "hello");
+ KUNIT_EXPECT_EQ(test, s.len, 5);
+ KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s));
+ KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "hello");
+
+ seq_buf_clear(&s);
+
+ KUNIT_EXPECT_EQ(test, s.len, 0);
+ KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s));
+ KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "");
+}
+
+static void seq_buf_puts_test(struct kunit *test)
+{
+ DECLARE_SEQ_BUF(s, 16);
+
+ seq_buf_puts(&s, "hello");
+ KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 5);
+ KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s));
+ KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "hello");
+
+ seq_buf_puts(&s, " world");
+ KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 11);
+ KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s));
+ KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "hello world");
+}
+
+static void seq_buf_puts_overflow_test(struct kunit *test)
+{
+ DECLARE_SEQ_BUF(s, 10);
+
+ seq_buf_puts(&s, "123456789");
+ KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s));
+ KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 9);
+
+ seq_buf_puts(&s, "0");
+ KUNIT_EXPECT_TRUE(test, seq_buf_has_overflowed(&s));
+ KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 10);
+ KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "123456789");
+
+ seq_buf_clear(&s);
+ KUNIT_EXPECT_EQ(test, s.len, 0);
+ KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s));
+ KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "");
+}
+
+static void seq_buf_putc_test(struct kunit *test)
+{
+ DECLARE_SEQ_BUF(s, 4);
+
+ seq_buf_putc(&s, 'a');
+ seq_buf_putc(&s, 'b');
+ seq_buf_putc(&s, 'c');
+
+ KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 3);
+ KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s));
+ KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "abc");
+
+ seq_buf_putc(&s, 'd');
+ KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 4);
+ KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s));
+ KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "abc");
+
+ seq_buf_putc(&s, 'e');
+ KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 4);
+ KUNIT_EXPECT_TRUE(test, seq_buf_has_overflowed(&s));
+ KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "abc");
+
+ seq_buf_clear(&s);
+ KUNIT_EXPECT_EQ(test, s.len, 0);
+ KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s));
+ KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "");
+}
+
+static void seq_buf_printf_test(struct kunit *test)
+{
+ DECLARE_SEQ_BUF(s, 32);
+
+ seq_buf_printf(&s, "hello %s", "world");
+ KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 11);
+ KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s));
+ KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "hello world");
+
+ seq_buf_printf(&s, " %d", 123);
+ KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 15);
+ KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s));
+ KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "hello world 123");
+}
+
+static void seq_buf_printf_overflow_test(struct kunit *test)
+{
+ DECLARE_SEQ_BUF(s, 16);
+
+ seq_buf_printf(&s, "%lu", 1234567890UL);
+ KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s));
+ KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 10);
+ KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "1234567890");
+
+ seq_buf_printf(&s, "%s", "abcdefghij");
+ KUNIT_EXPECT_TRUE(test, seq_buf_has_overflowed(&s));
+ KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 16);
+ KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "1234567890abcde");
+
+ seq_buf_clear(&s);
+ KUNIT_EXPECT_EQ(test, s.len, 0);
+ KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s));
+ KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "");
+}
+
+static void seq_buf_get_buf_commit_test(struct kunit *test)
+{
+ DECLARE_SEQ_BUF(s, 16);
+ char *buf;
+ size_t len;
+
+ len = seq_buf_get_buf(&s, &buf);
+ KUNIT_EXPECT_EQ(test, len, 16);
+ KUNIT_EXPECT_PTR_NE(test, buf, NULL);
+
+ memcpy(buf, "hello", 5);
+ seq_buf_commit(&s, 5);
+
+ KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 5);
+ KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s));
+ KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "hello");
+
+ len = seq_buf_get_buf(&s, &buf);
+ KUNIT_EXPECT_EQ(test, len, 11);
+ KUNIT_EXPECT_PTR_NE(test, buf, NULL);
+
+ memcpy(buf, " worlds!", 8);
+ seq_buf_commit(&s, 6);
+
+ KUNIT_EXPECT_EQ(test, seq_buf_used(&s), 11);
+ KUNIT_EXPECT_FALSE(test, seq_buf_has_overflowed(&s));
+ KUNIT_EXPECT_STREQ(test, seq_buf_str(&s), "hello world");
+
+ len = seq_buf_get_buf(&s, &buf);
+ KUNIT_EXPECT_EQ(test, len, 5);
+ KUNIT_EXPECT_PTR_NE(test, buf, NULL);
+
+ seq_buf_commit(&s, -1);
+ KUNIT_EXPECT_TRUE(test, seq_buf_has_overflowed(&s));
+}
+
+static struct kunit_case seq_buf_test_cases[] = {
+ KUNIT_CASE(seq_buf_init_test),
+ KUNIT_CASE(seq_buf_declare_test),
+ KUNIT_CASE(seq_buf_clear_test),
+ KUNIT_CASE(seq_buf_puts_test),
+ KUNIT_CASE(seq_buf_puts_overflow_test),
+ KUNIT_CASE(seq_buf_putc_test),
+ KUNIT_CASE(seq_buf_printf_test),
+ KUNIT_CASE(seq_buf_printf_overflow_test),
+ KUNIT_CASE(seq_buf_get_buf_commit_test),
+ {}
+};
+
+static struct kunit_suite seq_buf_test_suite = {
+ .name = "seq_buf",
+ .test_cases = seq_buf_test_cases,
+};
+
+kunit_test_suite(seq_buf_test_suite);
+
+MODULE_DESCRIPTION("Runtime test cases for seq_buf string API");
+MODULE_LICENSE("GPL");
diff --git a/lib/tests/test_bits.c b/lib/tests/test_bits.c
index 47325b41515f..ab88e50d2edf 100644
--- a/lib/tests/test_bits.c
+++ b/lib/tests/test_bits.c
@@ -26,6 +26,23 @@ static_assert(assert_type(u16, GENMASK_U16(15, 0)) == U16_MAX);
static_assert(assert_type(u32, GENMASK_U32(31, 0)) == U32_MAX);
static_assert(assert_type(u64, GENMASK_U64(63, 0)) == U64_MAX);
+/* FIXME: add a test case written in asm for GENMASK() and GENMASK_ULL() */
+
+static void __genmask_test(struct kunit *test)
+{
+ KUNIT_EXPECT_EQ(test, 1ul, __GENMASK(0, 0));
+ KUNIT_EXPECT_EQ(test, 3ul, __GENMASK(1, 0));
+ KUNIT_EXPECT_EQ(test, 6ul, __GENMASK(2, 1));
+ KUNIT_EXPECT_EQ(test, 0xFFFFFFFFul, __GENMASK(31, 0));
+}
+
+static void __genmask_ull_test(struct kunit *test)
+{
+ KUNIT_EXPECT_EQ(test, 1ull, __GENMASK_ULL(0, 0));
+ KUNIT_EXPECT_EQ(test, 3ull, __GENMASK_ULL(1, 0));
+ KUNIT_EXPECT_EQ(test, 0x000000ffffe00000ull, __GENMASK_ULL(39, 21));
+ KUNIT_EXPECT_EQ(test, 0xffffffffffffffffull, __GENMASK_ULL(63, 0));
+}
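Both helpers reduce to the same arithmetic: a run of (h - l + 1) one-bits starting at bit l. A sketch of an equivalent formulation (not the kernel's exact macro, which guards against undefined shifts differently), checked against the expected values above:

#include <assert.h>

static unsigned long long genmask_ull(unsigned int h, unsigned int l)
{
	/* Ones from bit 0 up to bit h, masked to start at bit l. */
	return (~0ULL >> (63 - h)) & (~0ULL << l);
}

int main(void)
{
	assert(genmask_ull(1, 0) == 0x3ULL);
	assert(genmask_ull(39, 21) == 0x000000ffffe00000ULL);
	assert(genmask_ull(63, 0) == ~0ULL);
	return 0;
}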
static void genmask_test(struct kunit *test)
{
@@ -123,6 +140,8 @@ static void genmask_input_check_test(struct kunit *test)
static struct kunit_case bits_test_cases[] = {
+ KUNIT_CASE(__genmask_test),
+ KUNIT_CASE(__genmask_ull_test),
KUNIT_CASE(genmask_test),
KUNIT_CASE(genmask_ull_test),
KUNIT_CASE(genmask_u128_test),
diff --git a/lib/tests/test_ratelimit.c b/lib/tests/test_ratelimit.c
new file mode 100644
index 000000000000..bfaeca49304a
--- /dev/null
+++ b/lib/tests/test_ratelimit.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <kunit/test.h>
+
+#include <linux/ratelimit.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/cpumask.h>
+
+/* a simple boot-time regression test */
+
+#define TESTRL_INTERVAL (5 * HZ)
+static DEFINE_RATELIMIT_STATE(testrl, TESTRL_INTERVAL, 3);
+
+#define test_ratelimited(test, expected) \
+ KUNIT_ASSERT_EQ(test, ___ratelimit(&testrl, "test_ratelimit_smoke"), (expected))
+
+static void test_ratelimit_smoke(struct kunit *test)
+{
+ // Check settings.
+ KUNIT_ASSERT_GE(test, TESTRL_INTERVAL, 100);
+
+ // Test normal operation.
+ test_ratelimited(test, true);
+ test_ratelimited(test, true);
+ test_ratelimited(test, true);
+ test_ratelimited(test, false);
+
+ schedule_timeout_idle(TESTRL_INTERVAL / 2);
+ test_ratelimited(test, false);
+
+ schedule_timeout_idle(TESTRL_INTERVAL * 3 / 4);
+ test_ratelimited(test, true);
+
+ schedule_timeout_idle(2 * TESTRL_INTERVAL);
+ test_ratelimited(test, true);
+ test_ratelimited(test, true);
+
+ schedule_timeout_idle(TESTRL_INTERVAL / 2);
+ test_ratelimited(test, true);
+ schedule_timeout_idle(TESTRL_INTERVAL * 3 / 4);
+ test_ratelimited(test, true);
+ test_ratelimited(test, true);
+ test_ratelimited(test, true);
+ test_ratelimited(test, false);
+
+ // Test disabling.
+ testrl.burst = 0;
+ test_ratelimited(test, false);
+ testrl.burst = 2;
+ testrl.interval = 0;
+ test_ratelimited(test, true);
+ test_ratelimited(test, true);
+ test_ratelimited(test, true);
+ test_ratelimited(test, true);
+ test_ratelimited(test, true);
+ test_ratelimited(test, true);
+ test_ratelimited(test, true);
+
+ // Testing re-enabling.
+ testrl.interval = TESTRL_INTERVAL;
+ test_ratelimited(test, true);
+ test_ratelimited(test, true);
+ test_ratelimited(test, false);
+ test_ratelimited(test, false);
+}
+
+static struct ratelimit_state stressrl = RATELIMIT_STATE_INIT_FLAGS("stressrl", HZ / 10, 3,
+ RATELIMIT_MSG_ON_RELEASE);
+
+static int doneflag;
+static const int stress_duration = 2 * HZ;
+
+struct stress_kthread {
+ unsigned long nattempts;
+ unsigned long nunlimited;
+ unsigned long nlimited;
+ unsigned long nmissed;
+ struct task_struct *tp;
+};
+
+static int test_ratelimit_stress_child(void *arg)
+{
+ struct stress_kthread *sktp = arg;
+
+ set_user_nice(current, MAX_NICE);
+ WARN_ON_ONCE(!sktp->tp);
+
+ while (!READ_ONCE(doneflag)) {
+ sktp->nattempts++;
+ if (___ratelimit(&stressrl, __func__))
+ sktp->nunlimited++;
+ else
+ sktp->nlimited++;
+ cond_resched();
+ }
+
+ sktp->nmissed = ratelimit_state_reset_miss(&stressrl);
+ return 0;
+}
+
+static void test_ratelimit_stress(struct kunit *test)
+{
+ int i;
+ const int n_stress_kthread = cpumask_weight(cpu_online_mask);
+ struct stress_kthread skt = { 0 };
+ struct stress_kthread *sktp = kcalloc(n_stress_kthread, sizeof(*sktp), GFP_KERNEL);
+
+ KUNIT_EXPECT_NOT_NULL_MSG(test, sktp, "Memory allocation failure");
+ for (i = 0; i < n_stress_kthread; i++) {
+ sktp[i].tp = kthread_run(test_ratelimit_stress_child, &sktp[i], "%s/%i",
+ "test_ratelimit_stress_child", i);
+ KUNIT_EXPECT_NOT_NULL_MSG(test, sktp, "kthread creation failure");
+ pr_alert("Spawned test_ratelimit_stress_child %d\n", i);
+ }
+ schedule_timeout_idle(stress_duration);
+ WRITE_ONCE(doneflag, 1);
+ for (i = 0; i < n_stress_kthread; i++) {
+ kthread_stop(sktp[i].tp);
+ skt.nattempts += sktp[i].nattempts;
+ skt.nunlimited += sktp[i].nunlimited;
+ skt.nlimited += sktp[i].nlimited;
+ skt.nmissed += sktp[i].nmissed;
+ }
+ KUNIT_ASSERT_EQ_MSG(test, skt.nunlimited + skt.nlimited, skt.nattempts,
+ "Outcomes not equal to attempts");
+ KUNIT_ASSERT_EQ_MSG(test, skt.nlimited, skt.nmissed, "Misses not equal to limits");
+}
+
+static struct kunit_case ratelimit_test_cases[] = {
+ KUNIT_CASE_SLOW(test_ratelimit_smoke),
+ KUNIT_CASE_SLOW(test_ratelimit_stress),
+ {}
+};
+
+static struct kunit_suite ratelimit_test_suite = {
+ .name = "lib_ratelimit",
+ .test_cases = ratelimit_test_cases,
+};
+
+kunit_test_suites(&ratelimit_test_suite);
+
+MODULE_DESCRIPTION("___ratelimit() KUnit test suite");
+MODULE_LICENSE("GPL");
diff --git a/lib/ubsan.c b/lib/ubsan.c
index a6ca235dd714..456e3dd8f4ea 100644
--- a/lib/ubsan.c
+++ b/lib/ubsan.c
@@ -333,18 +333,18 @@ EXPORT_SYMBOL(__ubsan_handle_implicit_conversion);
void __ubsan_handle_divrem_overflow(void *_data, void *lhs, void *rhs)
{
struct overflow_data *data = _data;
- char rhs_val_str[VALUE_LENGTH];
+ char lhs_val_str[VALUE_LENGTH];
if (suppress_report(&data->location))
return;
ubsan_prologue(&data->location, "division-overflow");
- val_to_string(rhs_val_str, sizeof(rhs_val_str), data->type, rhs);
+ val_to_string(lhs_val_str, sizeof(lhs_val_str), data->type, lhs);
if (type_is_signed(data->type) && get_signed_val(data->type, rhs) == -1)
pr_err("division of %s by -1 cannot be represented in type %s\n",
- rhs_val_str, data->type->type_name);
+ lhs_val_str, data->type->type_name);
else
pr_err("division by zero\n");
diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig
index 45df764b49ad..db87ba34ef19 100644
--- a/lib/vdso/Kconfig
+++ b/lib/vdso/Kconfig
@@ -12,24 +12,6 @@ config GENERIC_GETTIMEOFDAY
Each architecture that enables this feature has to
provide the fallback implementation.
-config GENERIC_VDSO_32
- bool
- depends on GENERIC_GETTIMEOFDAY && !64BIT
- help
- This config option helps to avoid possible performance issues
- in 32 bit only architectures.
-
-config GENERIC_COMPAT_VDSO
- bool
- help
- This config option enables the compat VDSO layer.
-
-config GENERIC_VDSO_TIME_NS
- bool
- help
- Selected by architectures which support time namespaces in the
- VDSO
-
config GENERIC_VDSO_OVERFLOW_PROTECT
bool
help
@@ -37,14 +19,9 @@ config GENERIC_VDSO_OVERFLOW_PROTECT
time getter functions for the price of an extra conditional
in the hotpath.
-endif
-
config VDSO_GETRANDOM
bool
help
Selected by architectures that support vDSO getrandom().
-config GENERIC_VDSO_DATA_STORE
- bool
- help
- Selected by architectures that use the generic vDSO data store.
+endif
diff --git a/lib/vdso/Makefile b/lib/vdso/Makefile
index aedd40aaa950..405f743253d7 100644
--- a/lib/vdso/Makefile
+++ b/lib/vdso/Makefile
@@ -1,3 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_GENERIC_VDSO_DATA_STORE) += datastore.o
+obj-$(CONFIG_HAVE_GENERIC_VDSO) += datastore.o
diff --git a/lib/vdso/datastore.c b/lib/vdso/datastore.c
index 3693c6caf2c4..a565c30c71a0 100644
--- a/lib/vdso/datastore.c
+++ b/lib/vdso/datastore.c
@@ -11,14 +11,14 @@
/*
* The vDSO data page.
*/
-#ifdef CONFIG_HAVE_GENERIC_VDSO
+#ifdef CONFIG_GENERIC_GETTIMEOFDAY
static union {
struct vdso_time_data data;
u8 page[PAGE_SIZE];
} vdso_time_data_store __page_aligned_data;
struct vdso_time_data *vdso_k_time_data = &vdso_time_data_store.data;
static_assert(sizeof(vdso_time_data_store) == PAGE_SIZE);
-#endif /* CONFIG_HAVE_GENERIC_VDSO */
+#endif /* CONFIG_GENERIC_GETTIMEOFDAY */
#ifdef CONFIG_VDSO_GETRANDOM
static union {
@@ -46,7 +46,7 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
switch (vmf->pgoff) {
case VDSO_TIME_PAGE_OFFSET:
- if (!IS_ENABLED(CONFIG_HAVE_GENERIC_VDSO))
+ if (!IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY))
return VM_FAULT_SIGBUS;
pfn = __phys_to_pfn(__pa_symbol(vdso_k_time_data));
if (timens_page) {
diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
index 93ef801a97ef..95df0153f05a 100644
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -2,6 +2,7 @@
/*
* Generic userspace implementations of gettimeofday() and similar.
*/
+#include <vdso/auxclock.h>
#include <vdso/datapage.h>
#include <vdso/helpers.h>
@@ -71,86 +72,90 @@ static inline bool vdso_cycles_ok(u64 cycles)
}
#endif
-#ifdef CONFIG_TIME_NS
+static __always_inline bool vdso_clockid_valid(clockid_t clock)
+{
+ /* Check for negative values or invalid clocks */
+ return likely((u32) clock <= CLOCK_AUX_LAST);
+}
+
+/*
+ * Must not be invoked within the sequence read section as a race inside
+ * that loop could result in __iter_div_u64_rem() being extremely slow.
+ */
+static __always_inline void vdso_set_timespec(struct __kernel_timespec *ts, u64 sec, u64 ns)
+{
+ ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+ ts->tv_nsec = ns;
+}
+
+static __always_inline
+bool vdso_get_timestamp(const struct vdso_time_data *vd, const struct vdso_clock *vc,
+ unsigned int clkidx, u64 *sec, u64 *ns)
+{
+ const struct vdso_timestamp *vdso_ts = &vc->basetime[clkidx];
+ u64 cycles;
+
+ if (unlikely(!vdso_clocksource_ok(vc)))
+ return false;
+
+ cycles = __arch_get_hw_counter(vc->clock_mode, vd);
+ if (unlikely(!vdso_cycles_ok(cycles)))
+ return false;
+
+ *ns = vdso_calc_ns(vc, cycles, vdso_ts->nsec);
+ *sec = vdso_ts->sec;
+
+ return true;
+}
-#ifdef CONFIG_GENERIC_VDSO_DATA_STORE
static __always_inline
const struct vdso_time_data *__arch_get_vdso_u_timens_data(const struct vdso_time_data *vd)
{
return (void *)vd + PAGE_SIZE;
}
-#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */
static __always_inline
-int do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns,
- clockid_t clk, struct __kernel_timespec *ts)
+bool do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns,
+ clockid_t clk, struct __kernel_timespec *ts)
{
const struct vdso_time_data *vd = __arch_get_vdso_u_timens_data(vdns);
const struct timens_offset *offs = &vcns->offset[clk];
const struct vdso_clock *vc = vd->clock_data;
- const struct vdso_timestamp *vdso_ts;
- u64 cycles, ns;
u32 seq;
s64 sec;
+ u64 ns;
if (clk != CLOCK_MONOTONIC_RAW)
vc = &vc[CS_HRES_COARSE];
else
vc = &vc[CS_RAW];
- vdso_ts = &vc->basetime[clk];
do {
seq = vdso_read_begin(vc);
- if (unlikely(!vdso_clocksource_ok(vc)))
- return -1;
-
- cycles = __arch_get_hw_counter(vc->clock_mode, vd);
- if (unlikely(!vdso_cycles_ok(cycles)))
- return -1;
- ns = vdso_calc_ns(vc, cycles, vdso_ts->nsec);
- sec = vdso_ts->sec;
+ if (!vdso_get_timestamp(vd, vc, clk, &sec, &ns))
+ return false;
} while (unlikely(vdso_read_retry(vc, seq)));
/* Add the namespace offset */
sec += offs->sec;
ns += offs->nsec;
- /*
- * Do this outside the loop: a race inside the loop could result
- * in __iter_div_u64_rem() being extremely slow.
- */
- ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
- ts->tv_nsec = ns;
+ vdso_set_timespec(ts, sec, ns);
- return 0;
-}
-#else
-static __always_inline
-const struct vdso_time_data *__arch_get_vdso_u_timens_data(const struct vdso_time_data *vd)
-{
- return NULL;
-}
-
-static __always_inline
-int do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns,
- clockid_t clk, struct __kernel_timespec *ts)
-{
- return -EINVAL;
+ return true;
}
-#endif
static __always_inline
-int do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc,
- clockid_t clk, struct __kernel_timespec *ts)
+bool do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc,
+ clockid_t clk, struct __kernel_timespec *ts)
{
- const struct vdso_timestamp *vdso_ts = &vc->basetime[clk];
- u64 cycles, sec, ns;
+ u64 sec, ns;
u32 seq;
/* Allows to compile the high resolution parts out */
if (!__arch_vdso_hres_capable())
- return -1;
+ return false;
do {
/*
@@ -172,30 +177,18 @@ int do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc,
}
smp_rmb();
- if (unlikely(!vdso_clocksource_ok(vc)))
- return -1;
-
- cycles = __arch_get_hw_counter(vc->clock_mode, vd);
- if (unlikely(!vdso_cycles_ok(cycles)))
- return -1;
- ns = vdso_calc_ns(vc, cycles, vdso_ts->nsec);
- sec = vdso_ts->sec;
+ if (!vdso_get_timestamp(vd, vc, clk, &sec, &ns))
+ return false;
} while (unlikely(vdso_read_retry(vc, seq)));
- /*
- * Do this outside the loop: a race inside the loop could result
- * in __iter_div_u64_rem() being extremely slow.
- */
- ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
- ts->tv_nsec = ns;
+ vdso_set_timespec(ts, sec, ns);
- return 0;
+ return true;
}
-#ifdef CONFIG_TIME_NS
static __always_inline
-int do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns,
- clockid_t clk, struct __kernel_timespec *ts)
+bool do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns,
+ clockid_t clk, struct __kernel_timespec *ts)
{
const struct vdso_time_data *vd = __arch_get_vdso_u_timens_data(vdns);
const struct timens_offset *offs = &vcns->offset[clk];
@@ -217,26 +210,14 @@ int do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock
sec += offs->sec;
nsec += offs->nsec;
- /*
- * Do this outside the loop: a race inside the loop could result
- * in __iter_div_u64_rem() being extremely slow.
- */
- ts->tv_sec = sec + __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec);
- ts->tv_nsec = nsec;
- return 0;
-}
-#else
-static __always_inline
-int do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns,
- clockid_t clk, struct __kernel_timespec *ts)
-{
- return -1;
+ vdso_set_timespec(ts, sec, nsec);
+
+ return true;
}
-#endif
static __always_inline
-int do_coarse(const struct vdso_time_data *vd, const struct vdso_clock *vc,
- clockid_t clk, struct __kernel_timespec *ts)
+bool do_coarse(const struct vdso_time_data *vd, const struct vdso_clock *vc,
+ clockid_t clk, struct __kernel_timespec *ts)
{
const struct vdso_timestamp *vdso_ts = &vc->basetime[clk];
u32 seq;
@@ -258,19 +239,60 @@ int do_coarse(const struct vdso_time_data *vd, const struct vdso_clock *vc,
ts->tv_nsec = vdso_ts->nsec;
} while (unlikely(vdso_read_retry(vc, seq)));
- return 0;
+ return true;
}
-static __always_inline int
+static __always_inline
+bool do_aux(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_timespec *ts)
+{
+ const struct vdso_clock *vc;
+ u32 seq, idx;
+ u64 sec, ns;
+
+ if (!IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS))
+ return false;
+
+ idx = clock - CLOCK_AUX;
+ vc = &vd->aux_clock_data[idx];
+
+ do {
+ /*
+ * Open coded function vdso_read_begin() to handle
+ * VDSO_CLOCK_TIMENS. See comment in do_hres().
+ */
+ while ((seq = READ_ONCE(vc->seq)) & 1) {
+ if (IS_ENABLED(CONFIG_TIME_NS) && vc->clock_mode == VDSO_CLOCKMODE_TIMENS) {
+ vd = __arch_get_vdso_u_timens_data(vd);
+ vc = &vd->aux_clock_data[idx];
+ /* Re-read from the real time data page */
+ continue;
+ }
+ cpu_relax();
+ }
+ smp_rmb();
+
+ /* Auxclock disabled? */
+ if (vc->clock_mode == VDSO_CLOCKMODE_NONE)
+ return false;
+
+ if (!vdso_get_timestamp(vd, vc, VDSO_BASE_AUX, &sec, &ns))
+ return false;
+ } while (unlikely(vdso_read_retry(vc, seq)));
+
+ vdso_set_timespec(ts, sec, ns);
+
+ return true;
+}
+
+static __always_inline bool
__cvdso_clock_gettime_common(const struct vdso_time_data *vd, clockid_t clock,
struct __kernel_timespec *ts)
{
const struct vdso_clock *vc = vd->clock_data;
u32 msk;
- /* Check for negative values or invalid clocks */
- if (unlikely((u32) clock >= MAX_CLOCKS))
- return -1;
+ if (!vdso_clockid_valid(clock))
+ return false;
/*
* Convert the clockid to a bitmask and use it to check which
@@ -283,8 +305,10 @@ __cvdso_clock_gettime_common(const struct vdso_time_data *vd, clockid_t clock,
return do_coarse(vd, &vc[CS_HRES_COARSE], clock, ts);
else if (msk & VDSO_RAW)
vc = &vc[CS_RAW];
+ else if (msk & VDSO_AUX)
+ return do_aux(vd, clock, ts);
else
- return -1;
+ return false;
return do_hres(vd, vc, clock, ts);
}
@@ -293,9 +317,11 @@ static __maybe_unused int
__cvdso_clock_gettime_data(const struct vdso_time_data *vd, clockid_t clock,
struct __kernel_timespec *ts)
{
- int ret = __cvdso_clock_gettime_common(vd, clock, ts);
+ bool ok;
+
+ ok = __cvdso_clock_gettime_common(vd, clock, ts);
- if (unlikely(ret))
+ if (unlikely(!ok))
return clock_gettime_fallback(clock, ts);
return 0;
}
@@ -312,18 +338,18 @@ __cvdso_clock_gettime32_data(const struct vdso_time_data *vd, clockid_t clock,
struct old_timespec32 *res)
{
struct __kernel_timespec ts;
- int ret;
+ bool ok;
- ret = __cvdso_clock_gettime_common(vd, clock, &ts);
+ ok = __cvdso_clock_gettime_common(vd, clock, &ts);
- if (unlikely(ret))
+ if (unlikely(!ok))
return clock_gettime32_fallback(clock, res);
- /* For ret == 0 */
+ /* For ok == true */
res->tv_sec = ts.tv_sec;
res->tv_nsec = ts.tv_nsec;
- return ret;
+ return 0;
}
static __maybe_unused int
@@ -342,7 +368,7 @@ __cvdso_gettimeofday_data(const struct vdso_time_data *vd,
if (likely(tv != NULL)) {
struct __kernel_timespec ts;
- if (do_hres(vd, &vc[CS_HRES_COARSE], CLOCK_REALTIME, &ts))
+ if (!do_hres(vd, &vc[CS_HRES_COARSE], CLOCK_REALTIME, &ts))
return gettimeofday_fallback(tv, tz);
tv->tv_sec = ts.tv_sec;
@@ -396,16 +422,15 @@ static __maybe_unused __kernel_old_time_t __cvdso_time(__kernel_old_time_t *time
#ifdef VDSO_HAS_CLOCK_GETRES
static __maybe_unused
-int __cvdso_clock_getres_common(const struct vdso_time_data *vd, clockid_t clock,
- struct __kernel_timespec *res)
+bool __cvdso_clock_getres_common(const struct vdso_time_data *vd, clockid_t clock,
+ struct __kernel_timespec *res)
{
const struct vdso_clock *vc = vd->clock_data;
u32 msk;
u64 ns;
- /* Check for negative values or invalid clocks */
- if (unlikely((u32) clock >= MAX_CLOCKS))
- return -1;
+ if (!vdso_clockid_valid(clock))
+ return false;
if (IS_ENABLED(CONFIG_TIME_NS) &&
vc->clock_mode == VDSO_CLOCKMODE_TIMENS)
@@ -426,24 +451,28 @@ int __cvdso_clock_getres_common(const struct vdso_time_data *vd, clockid_t clock
* Preserves the behaviour of posix_get_coarse_res().
*/
ns = LOW_RES_NSEC;
+ } else if (msk & VDSO_AUX) {
+ ns = aux_clock_resolution_ns();
} else {
- return -1;
+ return false;
}
if (likely(res)) {
res->tv_sec = 0;
res->tv_nsec = ns;
}
- return 0;
+ return true;
}
static __maybe_unused
int __cvdso_clock_getres_data(const struct vdso_time_data *vd, clockid_t clock,
struct __kernel_timespec *res)
{
- int ret = __cvdso_clock_getres_common(vd, clock, res);
+ bool ok;
- if (unlikely(ret))
+ ok = __cvdso_clock_getres_common(vd, clock, res);
+
+ if (unlikely(!ok))
return clock_getres_fallback(clock, res);
return 0;
}
@@ -460,18 +489,18 @@ __cvdso_clock_getres_time32_data(const struct vdso_time_data *vd, clockid_t cloc
struct old_timespec32 *res)
{
struct __kernel_timespec ts;
- int ret;
+ bool ok;
- ret = __cvdso_clock_getres_common(vd, clock, &ts);
+ ok = __cvdso_clock_getres_common(vd, clock, &ts);
- if (unlikely(ret))
+ if (unlikely(!ok))
return clock_getres32_fallback(clock, res);
if (likely(res)) {
res->tv_sec = ts.tv_sec;
res->tv_nsec = ts.tv_nsec;
}
- return ret;
+ return 0;
}
static __maybe_unused int
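The retry loops above all follow the classic seqcount read pattern. A userspace sketch of that pattern with C11 atomics (not the kernel's vdso_read_begin()/vdso_read_retry(), which additionally handle the timens redirection shown in do_aux()):

#include <stdatomic.h>

struct sample {
	atomic_uint seq;		/* odd while a writer is mid-update */
	unsigned long long sec, nsec;
};

static void read_sample(struct sample *s, unsigned long long *sec,
			unsigned long long *nsec)
{
	unsigned int begin;

	do {
		/* Spin while an update is in flight (odd sequence). */
		while ((begin = atomic_load_explicit(&s->seq,
						     memory_order_acquire)) & 1)
			;
		*sec = s->sec;
		*nsec = s->nsec;
		/* Retry if a writer ran while we were reading. */
	} while (atomic_load_explicit(&s->seq, memory_order_acquire) != begin);
}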
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 3d85800757aa..eb0cb11d0d12 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -60,6 +60,20 @@
bool no_hash_pointers __ro_after_init;
EXPORT_SYMBOL_GPL(no_hash_pointers);
+/*
+ * Hashed pointers policy selected by "hash_pointers=..." boot param
+ *
+ * `auto` - Hashed pointers enabled unless SLUB debugging is enabled (slub_debug)
+ * `always` - Hashed pointers enabled unconditionally
+ * `never` - Hashed pointers disabled unconditionally
+ */
+enum hash_pointers_policy {
+ HASH_PTR_AUTO = 0,
+ HASH_PTR_ALWAYS,
+ HASH_PTR_NEVER
+};
+static enum hash_pointers_policy hash_pointers_mode __initdata;
+
noinline
static unsigned long long simple_strntoull(const char *startp, char **endp, unsigned int base, size_t max_chars)
{
@@ -1699,10 +1713,9 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
return buf;
}
-#pragma GCC diagnostic push
-#ifndef __clang__
-#pragma GCC diagnostic ignored "-Wsuggest-attribute=format"
-#endif
+__diag_push();
+__diag_ignore(GCC, all, "-Wsuggest-attribute=format",
+ "Not a valid __printf() conversion candidate.");
static char *va_format(char *buf, char *end, struct va_format *va_fmt,
struct printf_spec spec)
{
@@ -1717,7 +1730,7 @@ static char *va_format(char *buf, char *end, struct va_format *va_fmt,
return buf;
}
-#pragma GCC diagnostic pop
+__diag_pop();
static noinline_for_stack
char *uuid_string(char *buf, char *end, const u8 *addr,
@@ -2289,12 +2302,23 @@ char *resource_or_range(const char *fmt, char *buf, char *end, void *ptr,
return resource_string(buf, end, ptr, spec, fmt);
}
-int __init no_hash_pointers_enable(char *str)
+void __init hash_pointers_finalize(bool slub_debug)
{
- if (no_hash_pointers)
- return 0;
+ switch (hash_pointers_mode) {
+ case HASH_PTR_ALWAYS:
+ no_hash_pointers = false;
+ break;
+ case HASH_PTR_NEVER:
+ no_hash_pointers = true;
+ break;
+ case HASH_PTR_AUTO:
+ default:
+ no_hash_pointers = slub_debug;
+ break;
+ }
- no_hash_pointers = true;
+ if (!no_hash_pointers)
+ return;
pr_warn("**********************************************************\n");
pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
@@ -2307,11 +2331,39 @@ int __init no_hash_pointers_enable(char *str)
pr_warn("** the kernel, report this immediately to your system **\n");
pr_warn("** administrator! **\n");
pr_warn("** **\n");
+ pr_warn("** Use hash_pointers=always to force this mode off **\n");
+ pr_warn("** **\n");
pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
pr_warn("**********************************************************\n");
+}
+
+static int __init hash_pointers_mode_parse(char *str)
+{
+ if (!str) {
+ pr_warn("Hash pointers mode empty; falling back to auto.\n");
+ hash_pointers_mode = HASH_PTR_AUTO;
+ } else if (strncmp(str, "auto", 4) == 0) {
+ pr_info("Hash pointers mode set to auto.\n");
+ hash_pointers_mode = HASH_PTR_AUTO;
+ } else if (strncmp(str, "never", 5) == 0) {
+ pr_info("Hash pointers mode set to never.\n");
+ hash_pointers_mode = HASH_PTR_NEVER;
+ } else if (strncmp(str, "always", 6) == 0) {
+ pr_info("Hash pointers mode set to always.\n");
+ hash_pointers_mode = HASH_PTR_ALWAYS;
+ } else {
+ pr_warn("Unknown hash_pointers mode '%s' specified; assuming auto.\n", str);
+ hash_pointers_mode = HASH_PTR_AUTO;
+ }
return 0;
}
+early_param("hash_pointers", hash_pointers_mode_parse);
+
+static int __init no_hash_pointers_enable(char *str)
+{
+ return hash_pointers_mode_parse("never");
+}
early_param("no_hash_pointers", no_hash_pointers_enable);
/*
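Based on the parser above, the boot command line accepts three spellings, with the legacy parameter mapping onto the new one:

	hash_pointers=auto	# default: unhash %p only when SLUB debugging is on
	hash_pointers=always	# keep hashing even with SLUB debugging
	hash_pointers=never	# same effect as the legacy no_hash_pointers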
diff --git a/lib/xarray.c b/lib/xarray.c
index 76dde3a1cacf..9a8b4916540c 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -370,7 +370,7 @@ static void *xas_alloc(struct xa_state *xas, unsigned int shift)
if (node) {
xas->xa_alloc = NULL;
} else {
- gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN;
+ gfp_t gfp = GFP_NOWAIT;
if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
gfp |= __GFP_ACCOUNT;
@@ -1910,6 +1910,7 @@ EXPORT_SYMBOL(xa_store_range);
* @xas: XArray operation state.
*
* Called after xas_load, the xas should not be in an error state.
+ * The xas should not be pointing to a sibling entry.
*
* Return: A number between 0 and 63 indicating the order of the entry.
*/
@@ -1920,6 +1921,8 @@ int xas_get_order(struct xa_state *xas)
if (!xas->xa_node)
return 0;
+ XA_NODE_BUG_ON(xas->xa_node, xa_is_sibling(xa_entry(xas->xa,
+ xas->xa_node, xas->xa_offset)));
for (;;) {
unsigned int slot = xas->xa_offset + (1 << order);
diff --git a/lib/xxhash.c b/lib/xxhash.c
index b5bd567aa6b3..cf629766f376 100644
--- a/lib/xxhash.c
+++ b/lib/xxhash.c
@@ -267,113 +267,6 @@ void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed)
}
EXPORT_SYMBOL(xxh64_reset);
-int xxh32_update(struct xxh32_state *state, const void *input, const size_t len)
-{
- const uint8_t *p = (const uint8_t *)input;
- const uint8_t *const b_end = p + len;
-
- if (input == NULL)
- return -EINVAL;
-
- state->total_len_32 += (uint32_t)len;
- state->large_len |= (len >= 16) | (state->total_len_32 >= 16);
-
- if (state->memsize + len < 16) { /* fill in tmp buffer */
- memcpy((uint8_t *)(state->mem32) + state->memsize, input, len);
- state->memsize += (uint32_t)len;
- return 0;
- }
-
- if (state->memsize) { /* some data left from previous update */
- const uint32_t *p32 = state->mem32;
-
- memcpy((uint8_t *)(state->mem32) + state->memsize, input,
- 16 - state->memsize);
-
- state->v1 = xxh32_round(state->v1, get_unaligned_le32(p32));
- p32++;
- state->v2 = xxh32_round(state->v2, get_unaligned_le32(p32));
- p32++;
- state->v3 = xxh32_round(state->v3, get_unaligned_le32(p32));
- p32++;
- state->v4 = xxh32_round(state->v4, get_unaligned_le32(p32));
- p32++;
-
- p += 16-state->memsize;
- state->memsize = 0;
- }
-
- if (p <= b_end - 16) {
- const uint8_t *const limit = b_end - 16;
- uint32_t v1 = state->v1;
- uint32_t v2 = state->v2;
- uint32_t v3 = state->v3;
- uint32_t v4 = state->v4;
-
- do {
- v1 = xxh32_round(v1, get_unaligned_le32(p));
- p += 4;
- v2 = xxh32_round(v2, get_unaligned_le32(p));
- p += 4;
- v3 = xxh32_round(v3, get_unaligned_le32(p));
- p += 4;
- v4 = xxh32_round(v4, get_unaligned_le32(p));
- p += 4;
- } while (p <= limit);
-
- state->v1 = v1;
- state->v2 = v2;
- state->v3 = v3;
- state->v4 = v4;
- }
-
- if (p < b_end) {
- memcpy(state->mem32, p, (size_t)(b_end-p));
- state->memsize = (uint32_t)(b_end-p);
- }
-
- return 0;
-}
-EXPORT_SYMBOL(xxh32_update);
-
-uint32_t xxh32_digest(const struct xxh32_state *state)
-{
- const uint8_t *p = (const uint8_t *)state->mem32;
- const uint8_t *const b_end = (const uint8_t *)(state->mem32) +
- state->memsize;
- uint32_t h32;
-
- if (state->large_len) {
- h32 = xxh_rotl32(state->v1, 1) + xxh_rotl32(state->v2, 7) +
- xxh_rotl32(state->v3, 12) + xxh_rotl32(state->v4, 18);
- } else {
- h32 = state->v3 /* == seed */ + PRIME32_5;
- }
-
- h32 += state->total_len_32;
-
- while (p + 4 <= b_end) {
- h32 += get_unaligned_le32(p) * PRIME32_3;
- h32 = xxh_rotl32(h32, 17) * PRIME32_4;
- p += 4;
- }
-
- while (p < b_end) {
- h32 += (*p) * PRIME32_5;
- h32 = xxh_rotl32(h32, 11) * PRIME32_1;
- p++;
- }
-
- h32 ^= h32 >> 15;
- h32 *= PRIME32_2;
- h32 ^= h32 >> 13;
- h32 *= PRIME32_3;
- h32 ^= h32 >> 16;
-
- return h32;
-}
-EXPORT_SYMBOL(xxh32_digest);
-
int xxh64_update(struct xxh64_state *state, const void *input, const size_t len)
{
const uint8_t *p = (const uint8_t *)input;