From 85c9f3a2b805eb96d899da7bcc38a16459aa3c16 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Mar 2026 19:35:55 -0800 Subject: lib/crc: tests: Make crc_kunit test only the enabled CRC variants Like commit 4478e8eeb871 ("lib/crypto: tests: Depend on library options rather than selecting them") did with the crypto library tests, make crc_kunit depend on the code it tests rather than selecting it. This follows the standard convention for KUnit and fixes an issue where enabling KUNIT_ALL_TESTS enabled non-test code. crc_kunit does differ from the crypto library tests in that it consolidates the tests for multiple CRC variants, with 5 kconfig options, into one KUnit suite. Since depending on *all* of these kconfig options would greatly restrict the ability to enable crc_kunit, instead just depend on *any* of these options. Update crc_kunit accordingly to test only the reachable code. Alternatively we could split crc_kunit into 5 test suites. But keeping it as one is simpler for now. Fixes: e47d9b1a76ed ("lib/crc_kunit.c: add KUnit test suite for CRC library functions") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20260306033557.250499-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crc/Kconfig | 7 +------ lib/crc/tests/crc_kunit.c | 28 ++++++++++++++++++++++------ 2 files changed, 23 insertions(+), 12 deletions(-) (limited to 'lib') diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig index 70e7a6016de3..9ddfd1a29757 100644 --- a/lib/crc/Kconfig +++ b/lib/crc/Kconfig @@ -99,13 +99,8 @@ config CRC_OPTIMIZATIONS config CRC_KUNIT_TEST tristate "KUnit tests for CRC functions" if !KUNIT_ALL_TESTS - depends on KUNIT + depends on KUNIT && (CRC7 || CRC16 || CRC_T10DIF || CRC32 || CRC64) default KUNIT_ALL_TESTS - select CRC7 - select CRC16 - select CRC_T10DIF - select CRC32 - select CRC64 help Unit tests for the CRC library functions. 
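The crc_kunit.c hunks below guard each group of test cases with IS_REACHABLE() rather than IS_ENABLED(), because a CRC option built as =m is only callable when crc_kunit itself is also modular. As an aside, here is a minimal standalone model of that rule, patterned on the IS_REACHABLE() definition in include/linux/kconfig.h; the function and parameter names are illustrative and this is not kernel code:

#include <stdbool.h>
#include <stdio.h>

/*
 * Models IS_REACHABLE(CONFIG_FOO): true if FOO=y, or if FOO=m and the
 * code doing the call is itself being built as a module.
 */
static bool is_reachable(bool foo_builtin, bool foo_module, bool caller_is_module)
{
	return foo_builtin || (foo_module && caller_is_module);
}

int main(void)
{
	/* CRC_KUNIT_TEST=y, CRC32=m: the crc32 cases must be compiled out. */
	printf("%d\n", is_reachable(false, true, false));	/* 0 */
	/* CRC_KUNIT_TEST=m, CRC32=m: the crc32 cases can be included. */
	printf("%d\n", is_reachable(false, true, true));	/* 1 */
	return 0;
}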
diff --git a/lib/crc/tests/crc_kunit.c b/lib/crc/tests/crc_kunit.c index 9a450e25ac81..9428cd913625 100644 --- a/lib/crc/tests/crc_kunit.c +++ b/lib/crc/tests/crc_kunit.c @@ -268,8 +268,7 @@ crc_benchmark(struct kunit *test, } } -/* crc7_be */ - +#if IS_REACHABLE(CONFIG_CRC7) static u64 crc7_be_wrapper(u64 crc, const u8 *p, size_t len) { /* @@ -294,9 +293,9 @@ static void crc7_be_benchmark(struct kunit *test) { crc_benchmark(test, crc7_be_wrapper); } +#endif /* CONFIG_CRC7 */ -/* crc16 */ - +#if IS_REACHABLE(CONFIG_CRC16) static u64 crc16_wrapper(u64 crc, const u8 *p, size_t len) { return crc16(crc, p, len); @@ -318,9 +317,9 @@ static void crc16_benchmark(struct kunit *test) { crc_benchmark(test, crc16_wrapper); } +#endif /* CONFIG_CRC16 */ -/* crc_t10dif */ - +#if IS_REACHABLE(CONFIG_CRC_T10DIF) static u64 crc_t10dif_wrapper(u64 crc, const u8 *p, size_t len) { return crc_t10dif_update(crc, p, len); @@ -342,6 +341,9 @@ static void crc_t10dif_benchmark(struct kunit *test) { crc_benchmark(test, crc_t10dif_wrapper); } +#endif /* CONFIG_CRC_T10DIF */ + +#if IS_REACHABLE(CONFIG_CRC32) /* crc32_le */ @@ -414,6 +416,9 @@ static void crc32c_benchmark(struct kunit *test) { crc_benchmark(test, crc32c_wrapper); } +#endif /* CONFIG_CRC32 */ + +#if IS_REACHABLE(CONFIG_CRC64) /* crc64_be */ @@ -463,24 +468,35 @@ static void crc64_nvme_benchmark(struct kunit *test) { crc_benchmark(test, crc64_nvme_wrapper); } +#endif /* CONFIG_CRC64 */ static struct kunit_case crc_test_cases[] = { +#if IS_REACHABLE(CONFIG_CRC7) KUNIT_CASE(crc7_be_test), KUNIT_CASE(crc7_be_benchmark), +#endif +#if IS_REACHABLE(CONFIG_CRC16) KUNIT_CASE(crc16_test), KUNIT_CASE(crc16_benchmark), +#endif +#if IS_REACHABLE(CONFIG_CRC_T10DIF) KUNIT_CASE(crc_t10dif_test), KUNIT_CASE(crc_t10dif_benchmark), +#endif +#if IS_REACHABLE(CONFIG_CRC32) KUNIT_CASE(crc32_le_test), KUNIT_CASE(crc32_le_benchmark), KUNIT_CASE(crc32_be_test), KUNIT_CASE(crc32_be_benchmark), KUNIT_CASE(crc32c_test), KUNIT_CASE(crc32c_benchmark), +#endif +#if IS_REACHABLE(CONFIG_CRC64) KUNIT_CASE(crc64_be_test), KUNIT_CASE(crc64_be_benchmark), KUNIT_CASE(crc64_nvme_test), KUNIT_CASE(crc64_nvme_benchmark), +#endif {}, }; -- cgit v1.2.3 From cdf22aeaad8430905c3aa3b3d0f2686c65395c22 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Mar 2026 19:35:56 -0800 Subject: lib/crc: tests: Add CRC_ENABLE_ALL_FOR_KUNIT Now that crc_kunit uses the standard "depends on" pattern, enabling the full set of CRC tests is a bit difficult, mainly due to CRC7 being rarely used. Add a kconfig option to make it easier. It is visible only when KUNIT, so hopefully the extra prompt won't be too annoying. Link: https://lore.kernel.org/r/20260306033557.250499-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crc/Kconfig | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'lib') diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig index 9ddfd1a29757..cca228879bb5 100644 --- a/lib/crc/Kconfig +++ b/lib/crc/Kconfig @@ -107,6 +107,20 @@ config CRC_KUNIT_TEST This is intended to help people writing architecture-specific optimized versions. If unsure, say N. +config CRC_ENABLE_ALL_FOR_KUNIT + tristate "Enable all CRC functions for KUnit test" + depends on KUNIT + select CRC7 + select CRC16 + select CRC_T10DIF + select CRC32 + select CRC64 + help + Enable all CRC functions that have test code in CRC_KUNIT_TEST. + + Enable this only if you'd like the CRC KUnit test suite to test all + the CRC variants, even ones that wouldn't otherwise need to be built. 
+ config CRC_BENCHMARK bool "Benchmark for the CRC functions" depends on CRC_KUNIT_TEST -- cgit v1.2.3 From c13cee2fc7f137dd25ed50c63eddcc578624f204 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Mar 2026 19:35:57 -0800 Subject: lib/crc: tests: Add a .kunitconfig file Add a .kunitconfig file to the lib/crc/ directory so that the CRC library tests can be run more easily using kunit.py. Example with UML: tools/testing/kunit/kunit.py run --kunitconfig=lib/crc Example with QEMU: tools/testing/kunit/kunit.py run --kunitconfig=lib/crc --arch=arm64 --make_options LLVM=1 Link: https://lore.kernel.org/r/20260306033557.250499-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crc/.kunitconfig | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 lib/crc/.kunitconfig (limited to 'lib') diff --git a/lib/crc/.kunitconfig b/lib/crc/.kunitconfig new file mode 100644 index 000000000000..0a3671ba573f --- /dev/null +++ b/lib/crc/.kunitconfig @@ -0,0 +1,3 @@ +CONFIG_KUNIT=y +CONFIG_CRC_ENABLE_ALL_FOR_KUNIT=y +CONFIG_CRC_KUNIT_TEST=y -- cgit v1.2.3 From 6e4d63e8993c681e1cec7d564b4e018e21e658d0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 14 Mar 2026 10:57:44 -0700 Subject: lib/crc: arm64: Drop check for CONFIG_KERNEL_MODE_NEON CONFIG_KERNEL_MODE_NEON is always enabled on arm64, and it always has been since its introduction in 2013. Given that and the fact that the usefulness of kernel-mode NEON has only been increasing over time, checking for this option in arm64-specific code is unnecessary. Remove this check from lib/crc/ to simplify the code and prevent any future bugs where e.g. code gets disabled due to a typo in this logic. Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20260314175744.30620-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig index cca228879bb5..52e216f39746 100644 --- a/lib/crc/Kconfig +++ b/lib/crc/Kconfig @@ -48,7 +48,7 @@ config CRC_T10DIF_ARCH bool depends on CRC_T10DIF && CRC_OPTIMIZATIONS default y if ARM && KERNEL_MODE_NEON - default y if ARM64 && KERNEL_MODE_NEON + default y if ARM64 default y if PPC64 && ALTIVEC default y if RISCV && RISCV_ISA_ZBC default y if X86 -- cgit v1.2.3 From 63432fd625372a0e79fb00a4009af204f4edc013 Mon Sep 17 00:00:00 2001 From: Demian Shulhan Date: Sun, 29 Mar 2026 07:43:38 +0000 Subject: lib/crc: arm64: add NEON accelerated CRC64-NVMe implementation Implement an optimized CRC64 (NVMe) algorithm for ARM64 using NEON Polynomial Multiply Long (PMULL) instructions. The generic shift-and-XOR software implementation is slow, which creates a bottleneck in NVMe and other storage subsystems. The acceleration is implemented using C intrinsics rather than raw assembly for better readability and maintainability. Key highlights of this implementation: - Uses 4KB chunking inside scoped_ksimd() to avoid preemption latency spikes on large buffers. - Pre-calculates and loads fold constants via vld1q_u64() to minimize register spilling. - Benchmarks show the break-even point against the generic implementation is around 128 bytes. The PMULL path is enabled only for len >= 128.
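Before the performance numbers, a rough sketch of the scalar baseline the commit message refers to may help. This is illustrative only: the function name is invented and the CRC64-NVMe polynomial is left as a parameter rather than quoted from the patch; the point is simply that a bit-at-a-time shift-and-XOR update costs eight dependent steps per byte, which is what folding 16 bytes per pair of carryless multiplies avoids.

#include <stddef.h>
#include <stdint.h>

/* Reflected (LSB-first) CRC64, one bit at a time. */
uint64_t crc64_bitwise(uint64_t crc, const uint8_t *p, size_t len,
		       uint64_t reflected_poly)
{
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1) ? reflected_poly : 0);
	}
	return crc;
}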
Performance results (kunit crc_benchmark on Cortex-A72): - Generic (len=4096): ~268 MB/s - PMULL (len=4096): ~1556 MB/s (nearly 6x improvement) Signed-off-by: Demian Shulhan Link: https://lore.kernel.org/r/20260329074338.1053550-1-demyansh@gmail.com Signed-off-by: Eric Biggers --- lib/crc/Kconfig | 1 + lib/crc/Makefile | 8 ++++- lib/crc/arm64/crc64-neon-inner.c | 78 ++++++++++++++++++++++++++++++++++++++++ lib/crc/arm64/crc64.h | 30 ++++++++++++++++ 4 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 lib/crc/arm64/crc64-neon-inner.c create mode 100644 lib/crc/arm64/crc64.h (limited to 'lib') diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig index 52e216f39746..31038c8d111a 100644 --- a/lib/crc/Kconfig +++ b/lib/crc/Kconfig @@ -82,6 +82,7 @@ config CRC64 config CRC64_ARCH bool depends on CRC64 && CRC_OPTIMIZATIONS + default y if ARM64 default y if RISCV && RISCV_ISA_ZBC && 64BIT default y if X86_64 diff --git a/lib/crc/Makefile b/lib/crc/Makefile index 7543ad295ab6..c9c35419b39c 100644 --- a/lib/crc/Makefile +++ b/lib/crc/Makefile @@ -38,9 +38,15 @@ obj-$(CONFIG_CRC64) += crc64.o crc64-y := crc64-main.o ifeq ($(CONFIG_CRC64_ARCH),y) CFLAGS_crc64-main.o += -I$(src)/$(SRCARCH) + +CFLAGS_REMOVE_arm64/crc64-neon-inner.o += -mgeneral-regs-only +CFLAGS_arm64/crc64-neon-inner.o += -ffreestanding -march=armv8-a+crypto +CFLAGS_arm64/crc64-neon-inner.o += -isystem $(shell $(CC) -print-file-name=include) +crc64-$(CONFIG_ARM64) += arm64/crc64-neon-inner.o + crc64-$(CONFIG_RISCV) += riscv/crc64_lsb.o riscv/crc64_msb.o crc64-$(CONFIG_X86) += x86/crc64-pclmul.o -endif +endif # CONFIG_CRC64_ARCH obj-y += tests/ diff --git a/lib/crc/arm64/crc64-neon-inner.c b/lib/crc/arm64/crc64-neon-inner.c new file mode 100644 index 000000000000..881cdafadb37 --- /dev/null +++ b/lib/crc/arm64/crc64-neon-inner.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Accelerated CRC64 (NVMe) using ARM NEON C intrinsics + */ + +#include +#include + +u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len); + +#define GET_P64_0(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 0)) +#define GET_P64_1(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 1)) + +/* x^191 mod G, x^127 mod G */ +static const u64 fold_consts_val[2] = { 0xeadc41fd2ba3d420ULL, + 0x21e9761e252621acULL }; +/* floor(x^127 / G), (G - x^64) / x */ +static const u64 bconsts_val[2] = { 0x27ecfa329aef9f77ULL, + 0x34d926535897936aULL }; + +u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len) +{ + uint64x2_t v0_u64 = { crc, 0 }; + poly64x2_t v0 = vreinterpretq_p64_u64(v0_u64); + poly64x2_t fold_consts = + vreinterpretq_p64_u64(vld1q_u64(fold_consts_val)); + poly64x2_t v1 = vreinterpretq_p64_u8(vld1q_u8(p)); + + v0 = vreinterpretq_p64_u8(veorq_u8(vreinterpretq_u8_p64(v0), + vreinterpretq_u8_p64(v1))); + p += 16; + len -= 16; + + do { + v1 = vreinterpretq_p64_u8(vld1q_u8(p)); + + poly128_t v2 = vmull_high_p64(fold_consts, v0); + poly128_t v0_128 = + vmull_p64(GET_P64_0(fold_consts), GET_P64_0(v0)); + + uint8x16_t x0 = veorq_u8(vreinterpretq_u8_p128(v0_128), + vreinterpretq_u8_p128(v2)); + + x0 = veorq_u8(x0, vreinterpretq_u8_p64(v1)); + v0 = vreinterpretq_p64_u8(x0); + + p += 16; + len -= 16; + } while (len >= 16); + + /* Multiply the 128-bit value by x^64 and reduce it back to 128 bits. 
*/ + poly64x2_t v7 = vreinterpretq_p64_u64((uint64x2_t){ 0, 0 }); + poly128_t v1_128 = vmull_p64(GET_P64_1(fold_consts), GET_P64_0(v0)); + + uint8x16_t ext_v0 = + vextq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p64(v7), 8); + uint8x16_t x0 = veorq_u8(ext_v0, vreinterpretq_u8_p128(v1_128)); + + v0 = vreinterpretq_p64_u8(x0); + + /* Final Barrett reduction */ + poly64x2_t bconsts = vreinterpretq_p64_u64(vld1q_u64(bconsts_val)); + + v1_128 = vmull_p64(GET_P64_0(bconsts), GET_P64_0(v0)); + + poly64x2_t v1_64 = vreinterpretq_p64_u8(vreinterpretq_u8_p128(v1_128)); + poly128_t v3_128 = vmull_p64(GET_P64_1(bconsts), GET_P64_0(v1_64)); + + x0 = veorq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p128(v3_128)); + + uint8x16_t ext_v2 = vextq_u8(vreinterpretq_u8_p64(v7), + vreinterpretq_u8_p128(v1_128), 8); + + x0 = veorq_u8(x0, ext_v2); + + v0 = vreinterpretq_p64_u8(x0); + return vgetq_lane_u64(vreinterpretq_u64_p64(v0), 1); +} diff --git a/lib/crc/arm64/crc64.h b/lib/crc/arm64/crc64.h new file mode 100644 index 000000000000..cc65abeee24c --- /dev/null +++ b/lib/crc/arm64/crc64.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * CRC64 using ARM64 PMULL instructions + */ + +#include +#include +#include +#include + +u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len); + +#define crc64_be_arch crc64_be_generic + +static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len) +{ + if (len >= 128 && cpu_have_named_feature(PMULL) && + likely(may_use_simd())) { + do { + size_t chunk = min_t(size_t, len & ~15, SZ_4K); + + scoped_ksimd() + crc = crc64_nvme_arm64_c(crc, p, chunk); + + p += chunk; + len -= chunk; + } while (len >= 128); + } + return crc64_nvme_generic(crc, p, len); +} -- cgit v1.2.3 From 5276ea17a23c829d4e4417569abff71a1c8342d9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 31 Mar 2026 17:44:31 -0700 Subject: lib/crc: arm64: Assume a little-endian kernel Since support for big-endian arm64 kernels was removed, the CPU_LE() macro now unconditionally emits the code it is passed, and the CPU_BE() macro now unconditionally discards the code it is passed. Simplify the assembly code in lib/crc/arm64/ accordingly. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20260401004431.151432-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crc/arm64/crc-t10dif-core.S | 56 ++++++++++++++++++++--------------------- lib/crc/arm64/crc32-core.S | 9 ++----- 2 files changed, 30 insertions(+), 35 deletions(-) (limited to 'lib') diff --git a/lib/crc/arm64/crc-t10dif-core.S b/lib/crc/arm64/crc-t10dif-core.S index 87dd6d46224d..71388466825b 100644 --- a/lib/crc/arm64/crc-t10dif-core.S +++ b/lib/crc/arm64/crc-t10dif-core.S @@ -181,13 +181,13 @@ SYM_FUNC_END(__pmull_p8_16x64) pmull16x64_\p fold_consts, \reg1, v8 -CPU_LE( rev64 v11.16b, v11.16b ) -CPU_LE( rev64 v12.16b, v12.16b ) + rev64 v11.16b, v11.16b + rev64 v12.16b, v12.16b pmull16x64_\p fold_consts, \reg2, v9 -CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 ) -CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 eor \reg1\().16b, \reg1\().16b, v8.16b eor \reg2\().16b, \reg2\().16b, v9.16b @@ -220,22 +220,22 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) ldp q4, q5, [buf, #0x40] ldp q6, q7, [buf, #0x60] add buf, buf, #0x80 -CPU_LE( rev64 v0.16b, v0.16b ) -CPU_LE( rev64 v1.16b, v1.16b ) -CPU_LE( rev64 v2.16b, v2.16b ) -CPU_LE( rev64 v3.16b, v3.16b ) -CPU_LE( rev64 v4.16b, v4.16b ) -CPU_LE( rev64 v5.16b, v5.16b ) -CPU_LE( rev64 v6.16b, v6.16b ) -CPU_LE( rev64 v7.16b, v7.16b ) -CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) -CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 ) -CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 ) -CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 ) -CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 ) -CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 ) -CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 ) -CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + rev64 v0.16b, v0.16b + rev64 v1.16b, v1.16b + rev64 v2.16b, v2.16b + rev64 v3.16b, v3.16b + rev64 v4.16b, v4.16b + rev64 v5.16b, v5.16b + rev64 v6.16b, v6.16b + rev64 v7.16b, v7.16b + ext v0.16b, v0.16b, v0.16b, #8 + ext v1.16b, v1.16b, v1.16b, #8 + ext v2.16b, v2.16b, v2.16b, #8 + ext v3.16b, v3.16b, v3.16b, #8 + ext v4.16b, v4.16b, v4.16b, #8 + ext v5.16b, v5.16b, v5.16b, #8 + ext v6.16b, v6.16b, v6.16b, #8 + ext v7.16b, v7.16b, v7.16b, #8 // XOR the first 16 data *bits* with the initial CRC value. movi v8.16b, #0 @@ -288,8 +288,8 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) pmull16x64_\p fold_consts, v7, v8 eor v7.16b, v7.16b, v8.16b ldr q0, [buf], #16 -CPU_LE( rev64 v0.16b, v0.16b ) -CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) + rev64 v0.16b, v0.16b + ext v0.16b, v0.16b, v0.16b, #8 eor v7.16b, v7.16b, v0.16b subs len, len, #16 b.ge .Lfold_16_bytes_loop_\@ @@ -310,8 +310,8 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) // v0 = last 16 original data bytes add buf, buf, len ldr q0, [buf, #-16] -CPU_LE( rev64 v0.16b, v0.16b ) -CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) + rev64 v0.16b, v0.16b + ext v0.16b, v0.16b, v0.16b, #8 // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes. adr_l x4, .Lbyteshift_table + 16 @@ -344,8 +344,8 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) // Load the first 16 data bytes. ldr q7, [buf], #0x10 -CPU_LE( rev64 v7.16b, v7.16b ) -CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + rev64 v7.16b, v7.16b + ext v7.16b, v7.16b, v7.16b, #8 // XOR the first 16 data *bits* with the initial CRC value. 
movi v0.16b, #0 @@ -382,8 +382,8 @@ SYM_FUNC_START(crc_t10dif_pmull_p8) crc_t10dif_pmull p8 -CPU_LE( rev64 v7.16b, v7.16b ) -CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + rev64 v7.16b, v7.16b + ext v7.16b, v7.16b, v7.16b, #8 str q7, [x3] frame_pop diff --git a/lib/crc/arm64/crc32-core.S b/lib/crc/arm64/crc32-core.S index 68825317460f..49d02cc485b3 100644 --- a/lib/crc/arm64/crc32-core.S +++ b/lib/crc/arm64/crc32-core.S @@ -29,24 +29,19 @@ .endm .macro hwordle, reg -CPU_BE( rev16 \reg, \reg ) .endm .macro hwordbe, reg -CPU_LE( rev \reg, \reg ) + rev \reg, \reg rbit \reg, \reg -CPU_BE( lsr \reg, \reg, #16 ) .endm .macro le, regs:vararg - .irp r, \regs -CPU_BE( rev \r, \r ) - .endr .endm .macro be, regs:vararg .irp r, \regs -CPU_LE( rev \r, \r ) + rev \r, \r .endr .irp r, \regs rbit \r, \r -- cgit v1.2.3 From e0718ed60d60299840cfc2a408eb26042a20d186 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Mar 2026 16:46:32 +0200 Subject: lib/crc: arm64: Drop unnecessary chunking logic from crc64 On arm64, kernel mode NEON executes with preemption enabled, so there is no need to chunk the input by hand. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20260330144630.33026-8-ardb@kernel.org Signed-off-by: Eric Biggers --- lib/crc/arm64/crc64.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/crc/arm64/crc64.h b/lib/crc/arm64/crc64.h index cc65abeee24c..60151ec3035a 100644 --- a/lib/crc/arm64/crc64.h +++ b/lib/crc/arm64/crc64.h @@ -16,15 +16,13 @@ static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len) { if (len >= 128 && cpu_have_named_feature(PMULL) && likely(may_use_simd())) { - do { - size_t chunk = min_t(size_t, len & ~15, SZ_4K); + size_t chunk = len & ~15; - scoped_ksimd() - crc = crc64_nvme_arm64_c(crc, p, chunk); + scoped_ksimd() + crc = crc64_nvme_arm64_c(crc, p, chunk); - p += chunk; - len -= chunk; - } while (len >= 128); + p += chunk; + len &= 15; } return crc64_nvme_generic(crc, p, len); } -- cgit v1.2.3 From f956dc813144baf8bd2d77eec61b90bc00c10894 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Mar 2026 16:46:33 +0200 Subject: lib/crc: arm64: Use existing macros for kernel-mode FPU cflags Use the existing CC_FLAGS_FPU and CC_FLAGS_NO_FPU to pass the appropriate compiler command line options for building kernel mode NEON intrinsics code. This is tidier, and will make it easier to reuse the code for 32-bit ARM.
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20260330144630.33026-9-ardb@kernel.org Signed-off-by: Eric Biggers --- lib/crc/Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/crc/Makefile b/lib/crc/Makefile index c9c35419b39c..ff213590e4e3 100644 --- a/lib/crc/Makefile +++ b/lib/crc/Makefile @@ -39,9 +39,8 @@ crc64-y := crc64-main.o ifeq ($(CONFIG_CRC64_ARCH),y) CFLAGS_crc64-main.o += -I$(src)/$(SRCARCH) -CFLAGS_REMOVE_arm64/crc64-neon-inner.o += -mgeneral-regs-only -CFLAGS_arm64/crc64-neon-inner.o += -ffreestanding -march=armv8-a+crypto -CFLAGS_arm64/crc64-neon-inner.o += -isystem $(shell $(CC) -print-file-name=include) +CFLAGS_REMOVE_arm64/crc64-neon-inner.o += $(CC_FLAGS_NO_FPU) +CFLAGS_arm64/crc64-neon-inner.o += $(CC_FLAGS_FPU) -march=armv8-a+crypto crc64-$(CONFIG_ARM64) += arm64/crc64-neon-inner.o crc64-$(CONFIG_RISCV) += riscv/crc64_lsb.o riscv/crc64_msb.o -- cgit v1.2.3 From 8fdef85d601db670e9c178314eedffe7bbb07e52 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Mar 2026 16:46:35 +0200 Subject: lib/crc: arm64: Simplify intrinsics implementation NEON intrinsics are useful because they remove the need for manual register allocation, and the resulting code can be re-compiled and optimized for different micro-architectures, and shared between arm64 and 32-bit ARM. However, the strong typing of the vector variables can lead to incomprehensible gibberish, as is the case with the new CRC64 implementation. To address this, let's repaint all variables as uint64x2_t to minimize the number of vreinterpretq_xxx() calls, and to be able to rely on the ^ operator for exclusive OR operations. This makes the code much more concise and readable. While at it, wrap the calls to vmull_p64() et al in order to have a more consistent calling convention, and encapsulate any remaining vreinterpret() calls that are still needed. 
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20260330144630.33026-11-ardb@kernel.org Signed-off-by: Eric Biggers --- lib/crc/arm64/crc64-neon-inner.c | 77 +++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 45 deletions(-) (limited to 'lib') diff --git a/lib/crc/arm64/crc64-neon-inner.c b/lib/crc/arm64/crc64-neon-inner.c index 881cdafadb37..28527e544ff6 100644 --- a/lib/crc/arm64/crc64-neon-inner.c +++ b/lib/crc/arm64/crc64-neon-inner.c @@ -8,9 +8,6 @@ u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len); -#define GET_P64_0(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 0)) -#define GET_P64_1(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 1)) - /* x^191 mod G, x^127 mod G */ static const u64 fold_consts_val[2] = { 0xeadc41fd2ba3d420ULL, 0x21e9761e252621acULL }; @@ -18,61 +15,51 @@ static const u64 fold_consts_val[2] = { 0xeadc41fd2ba3d420ULL, static const u64 bconsts_val[2] = { 0x27ecfa329aef9f77ULL, 0x34d926535897936aULL }; -u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len) +static inline uint64x2_t pmull64(uint64x2_t a, uint64x2_t b) { - uint64x2_t v0_u64 = { crc, 0 }; - poly64x2_t v0 = vreinterpretq_p64_u64(v0_u64); - poly64x2_t fold_consts = - vreinterpretq_p64_u64(vld1q_u64(fold_consts_val)); - poly64x2_t v1 = vreinterpretq_p64_u8(vld1q_u8(p)); + return vreinterpretq_u64_p128(vmull_p64(vgetq_lane_u64(a, 0), + vgetq_lane_u64(b, 0))); +} - v0 = vreinterpretq_p64_u8(veorq_u8(vreinterpretq_u8_p64(v0), - vreinterpretq_u8_p64(v1))); - p += 16; - len -= 16; +static inline uint64x2_t pmull64_high(uint64x2_t a, uint64x2_t b) +{ + poly64x2_t l = vreinterpretq_p64_u64(a); + poly64x2_t m = vreinterpretq_p64_u64(b); - do { - v1 = vreinterpretq_p64_u8(vld1q_u8(p)); + return vreinterpretq_u64_p128(vmull_high_p64(l, m)); +} - poly128_t v2 = vmull_high_p64(fold_consts, v0); - poly128_t v0_128 = - vmull_p64(GET_P64_0(fold_consts), GET_P64_0(v0)); +static inline uint64x2_t pmull64_hi_lo(uint64x2_t a, uint64x2_t b) +{ + return vreinterpretq_u64_p128(vmull_p64(vgetq_lane_u64(a, 1), + vgetq_lane_u64(b, 0))); +} - uint8x16_t x0 = veorq_u8(vreinterpretq_u8_p128(v0_128), - vreinterpretq_u8_p128(v2)); +u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len) +{ + uint64x2_t fold_consts = vld1q_u64(fold_consts_val); + uint64x2_t v0 = { crc, 0 }; + uint64x2_t zero = { }; - x0 = veorq_u8(x0, vreinterpretq_u8_p64(v1)); - v0 = vreinterpretq_p64_u8(x0); + for (;;) { + v0 ^= vreinterpretq_u64_u8(vld1q_u8(p)); p += 16; len -= 16; - } while (len >= 16); - - /* Multiply the 128-bit value by x^64 and reduce it back to 128 bits. */ - poly64x2_t v7 = vreinterpretq_p64_u64((uint64x2_t){ 0, 0 }); - poly128_t v1_128 = vmull_p64(GET_P64_1(fold_consts), GET_P64_0(v0)); + if (len < 16) + break; - uint8x16_t ext_v0 = - vextq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p64(v7), 8); - uint8x16_t x0 = veorq_u8(ext_v0, vreinterpretq_u8_p128(v1_128)); + v0 = pmull64(fold_consts, v0) ^ pmull64_high(fold_consts, v0); + } - v0 = vreinterpretq_p64_u8(x0); + /* Multiply the 128-bit value by x^64 and reduce it back to 128 bits. 
*/ + v0 = vextq_u64(v0, zero, 1) ^ pmull64_hi_lo(fold_consts, v0); /* Final Barrett reduction */ - poly64x2_t bconsts = vreinterpretq_p64_u64(vld1q_u64(bconsts_val)); - - v1_128 = vmull_p64(GET_P64_0(bconsts), GET_P64_0(v0)); - - poly64x2_t v1_64 = vreinterpretq_p64_u8(vreinterpretq_u8_p128(v1_128)); - poly128_t v3_128 = vmull_p64(GET_P64_1(bconsts), GET_P64_0(v1_64)); - - x0 = veorq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p128(v3_128)); - - uint8x16_t ext_v2 = vextq_u8(vreinterpretq_u8_p64(v7), - vreinterpretq_u8_p128(v1_128), 8); + uint64x2_t bconsts = vld1q_u64(bconsts_val); + uint64x2_t final = pmull64(bconsts, v0); - x0 = veorq_u8(x0, ext_v2); + v0 ^= vextq_u64(zero, final, 1) ^ pmull64_hi_lo(bconsts, final); - v0 = vreinterpretq_p64_u8(x0); - return vgetq_lane_u64(vreinterpretq_u64_p64(v0), 1); + return vgetq_lane_u64(v0, 1); } -- cgit v1.2.3
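For readers working through the folding math without a PMULL-capable machine, the helpers from the final patch can be modeled with a scalar carryless multiply. Everything below is an illustrative sketch rather than kernel code: struct u128, clmul64() and fold16() are invented for this note, and only the lane selection mirrors pmull64(), pmull64_high() and pmull64_hi_lo() above.

#include <stdint.h>

struct u128 { uint64_t lo, hi; };

/* 64 x 64 -> 128-bit carryless (polynomial) multiplication. */
struct u128 clmul64(uint64_t a, uint64_t b)
{
	struct u128 r = { 0, 0 };

	for (int i = 0; i < 64; i++) {
		if (b & (1ULL << i)) {
			r.lo ^= a << i;
			r.hi ^= i ? a >> (64 - i) : 0;
		}
	}
	return r;
}

/*
 * One fold step of the main loop: the low lanes go through what pmull64()
 * computes, the high lanes through pmull64_high(), and the folded value is
 * then XORed with the next 16 bytes of input.
 */
struct u128 fold16(struct u128 fold_consts, struct u128 v0, struct u128 next)
{
	struct u128 lo = clmul64(fold_consts.lo, v0.lo);
	struct u128 hi = clmul64(fold_consts.hi, v0.hi);

	return (struct u128){ lo.lo ^ hi.lo ^ next.lo, lo.hi ^ hi.hi ^ next.hi };
}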