lib/crypto: arm64/nh: Migrate optimized code into library

Migrate the arm64 NEON implementation of NH into lib/crypto/. This makes the nh() function be optimized on arm64 kernels. Note: this temporarily makes the adiantum template not utilize the arm64 optimized NH code. This is resolved in a later commit that converts the adiantum template to use nh() instead of "nhpoly1305". Link: https://lore.kernel.org/r/20251211011846.8179-5-ebiggers@kernel.org Signed-off-by: Eric Biggers <ebiggers@kernel.org>
author: Eric Biggers <ebiggers@kernel.org> 2025-12-10 17:18:36 -0800
committer: Eric Biggers <ebiggers@kernel.org> 2026-01-12 11:07:50 -0800
commit: b4a8528d17fbcd9027290c168efd6ba7ac4d4cd2 (patch)
tree: 3ec1a8d6beed6d790c74aed8bc7f6ebbb0723f55 /lib
parent: 29e39a11f541d068ed7155368f4a79aa0ddf9c52 (diff)
4 files changed, 139 insertions, 0 deletions
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index c6ee7ca77632..aa3f850ece24 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -118,6 +118,7 @@ config CRYPTO_LIB_NH_ARCH
 	bool
 	depends on CRYPTO_LIB_NH && !UML
 	default y if ARM && KERNEL_MODE_NEON
+	default y if ARM64 && KERNEL_MODE_NEON
 
 config CRYPTO_LIB_POLY1305
 	tristate
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 6dae7e182847..e3a13952bc2a 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -136,6 +136,7 @@ libnh-y := nh.o
 ifeq ($(CONFIG_CRYPTO_LIB_NH_ARCH),y)
 CFLAGS_nh.o += -I$(src)/$(SRCARCH)
 libnh-$(CONFIG_ARM) += arm/nh-neon-core.o
+libnh-$(CONFIG_ARM64) += arm64/nh-neon-core.o
 endif
 
 ################################################################################
diff --git a/lib/crypto/arm64/nh-neon-core.S b/lib/crypto/arm64/nh-neon-core.S
new file mode 100644
index 000000000000..6fa57fce8085
--- /dev/null
+++ b/lib/crypto/arm64/nh-neon-core.S
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, ARM64 NEON accelerated version
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+	KEY		.req	x0
+	MESSAGE		.req	x1
+	MESSAGE_LEN	.req	x2
+	HASH		.req	x3
+
+	PASS0_SUMS	.req	v0
+	PASS1_SUMS	.req	v1
+	PASS2_SUMS	.req	v2
+	PASS3_SUMS	.req	v3
+	K0		.req	v4
+	K1		.req	v5
+	K2		.req	v6
+	K3		.req	v7
+	T0		.req	v8
+	T1		.req	v9
+	T2		.req	v10
+	T3		.req	v11
+	T4		.req	v12
+	T5		.req	v13
+	T6		.req	v14
+	T7		.req	v15
+
+.macro _nh_stride	k0, k1, k2, k3
+
+	// Load next message stride
+	ld1		{T3.16b}, [MESSAGE], #16
+
+	// Load next key stride
+	ld1		{\k3\().4s}, [KEY], #16
+
+	// Add message words to key words
+	add		T0.4s, T3.4s, \k0\().4s
+	add		T1.4s, T3.4s, \k1\().4s
+	add		T2.4s, T3.4s, \k2\().4s
+	add		T3.4s, T3.4s, \k3\().4s
+
+	// Multiply 32x32 => 64 and accumulate
+	mov		T4.d[0], T0.d[1]
+	mov		T5.d[0], T1.d[1]
+	mov		T6.d[0], T2.d[1]
+	mov		T7.d[0], T3.d[1]
+	umlal		PASS0_SUMS.2d, T0.2s, T4.2s
+	umlal		PASS1_SUMS.2d, T1.2s, T5.2s
+	umlal		PASS2_SUMS.2d, T2.2s, T6.2s
+	umlal		PASS3_SUMS.2d, T3.2s, T7.2s
+.endm
+
+/*
+ * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
+ *		__le64 hash[NH_NUM_PASSES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+SYM_FUNC_START(nh_neon)
+
+	ld1		{K0.4s,K1.4s}, [KEY], #32
+	  movi		PASS0_SUMS.2d, #0
+	  movi		PASS1_SUMS.2d, #0
+	ld1		{K2.4s}, [KEY], #16
+	  movi		PASS2_SUMS.2d, #0
+	  movi		PASS3_SUMS.2d, #0
+
+	subs		MESSAGE_LEN, MESSAGE_LEN, #64
+	blt		.Lloop4_done
+.Lloop4:
+	_nh_stride	K0, K1, K2, K3
+	_nh_stride	K1, K2, K3, K0
+	_nh_stride	K2, K3, K0, K1
+	_nh_stride	K3, K0, K1, K2
+	subs		MESSAGE_LEN, MESSAGE_LEN, #64
+	bge		.Lloop4
+
+.Lloop4_done:
+	ands		MESSAGE_LEN, MESSAGE_LEN, #63
+	beq		.Ldone
+	_nh_stride	K0, K1, K2, K3
+
+	subs		MESSAGE_LEN, MESSAGE_LEN, #16
+	beq		.Ldone
+	_nh_stride	K1, K2, K3, K0
+
+	subs		MESSAGE_LEN, MESSAGE_LEN, #16
+	beq		.Ldone
+	_nh_stride	K2, K3, K0, K1
+
+.Ldone:
+	// Sum the accumulators for each pass, then store the sums to 'hash'
+	addp		T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
+	addp		T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
+	st1		{T0.16b,T1.16b}, [HASH]
+	ret
+SYM_FUNC_END(nh_neon)
diff --git a/lib/crypto/arm64/nh.h b/lib/crypto/arm64/nh.h
new file mode 100644
index 000000000000..08902630bdd1
--- /dev/null
+++ b/lib/crypto/arm64/nh.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ARM64 accelerated implementation of NH
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <asm/hwcap.h>
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len,
+			__le64 hash[NH_NUM_PASSES]);
+
+static bool nh_arch(const u32 *key, const u8 *message, size_t message_len,
+		    __le64 hash[NH_NUM_PASSES])
+{
+	if (static_branch_likely(&have_neon) && message_len >= 64 &&
+	    may_use_simd()) {
+		scoped_ksimd()
+			nh_neon(key, message, message_len, hash);
+		return true;
+	}
+	return false;
+}
+
+#define nh_mod_init_arch nh_mod_init_arch
+static void nh_mod_init_arch(void)
+{
+	if (cpu_have_named_feature(ASIMD))
+		static_branch_enable(&have_neon);
+}
author	Eric Biggers <ebiggers@kernel.org>	2025-12-10 17:18:36 -0800
committer	Eric Biggers <ebiggers@kernel.org>	2026-01-12 11:07:50 -0800
commit	b4a8528d17fbcd9027290c168efd6ba7ac4d4cd2 (patch)
tree	3ec1a8d6beed6d790c74aed8bc7f6ebbb0723f55 /lib
parent	29e39a11f541d068ed7155368f4a79aa0ddf9c52 (diff)