From 85c9f3a2b805eb96d899da7bcc38a16459aa3c16 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Mar 2026 19:35:55 -0800 Subject: lib/crc: tests: Make crc_kunit test only the enabled CRC variants Like commit 4478e8eeb871 ("lib/crypto: tests: Depend on library options rather than selecting them") did with the crypto library tests, make crc_kunit depend on the code it tests rather than selecting it. This follows the standard convention for KUnit and fixes an issue where enabling KUNIT_ALL_TESTS enabled non-test code. crc_kunit does differ from the crypto library tests in that it consolidates the tests for multiple CRC variants, with 5 kconfig options, into one KUnit suite. Since depending on *all* of these kconfig options would greatly restrict the ability to enable crc_kunit, instead just depend on *any* of these options. Update crc_kunit accordingly to test only the reachable code. Alternatively we could split crc_kunit into 5 test suites. But keeping it as one is simpler for now. Fixes: e47d9b1a76ed ("lib/crc_kunit.c: add KUnit test suite for CRC library functions") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20260306033557.250499-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crc/Kconfig | 7 +------ lib/crc/tests/crc_kunit.c | 28 ++++++++++++++++++++++------ 2 files changed, 23 insertions(+), 12 deletions(-) (limited to 'lib') diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig index 70e7a6016de3..9ddfd1a29757 100644 --- a/lib/crc/Kconfig +++ b/lib/crc/Kconfig @@ -99,13 +99,8 @@ config CRC_OPTIMIZATIONS config CRC_KUNIT_TEST tristate "KUnit tests for CRC functions" if !KUNIT_ALL_TESTS - depends on KUNIT + depends on KUNIT && (CRC7 || CRC16 || CRC_T10DIF || CRC32 || CRC64) default KUNIT_ALL_TESTS - select CRC7 - select CRC16 - select CRC_T10DIF - select CRC32 - select CRC64 help Unit tests for the CRC library functions. 
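The crc_kunit.c hunks below guard each group of test cases with IS_REACHABLE() rather than IS_ENABLED(), because a CRC option built as =m is only callable when crc_kunit itself is also modular. As an aside, here is a minimal standalone model of that rule, patterned on the IS_REACHABLE() definition in include/linux/kconfig.h; the function and parameter names are illustrative and this is not kernel code:

#include <stdbool.h>
#include <stdio.h>

/*
 * Models IS_REACHABLE(CONFIG_FOO): true if FOO=y, or if FOO=m and the
 * code doing the call is itself being built as a module.
 */
static bool is_reachable(bool foo_builtin, bool foo_module, bool caller_is_module)
{
	return foo_builtin || (foo_module && caller_is_module);
}

int main(void)
{
	/* CRC_KUNIT_TEST=y, CRC32=m: the crc32 cases must be compiled out. */
	printf("%d\n", is_reachable(false, true, false));	/* 0 */
	/* CRC_KUNIT_TEST=m, CRC32=m: the crc32 cases can be included. */
	printf("%d\n", is_reachable(false, true, true));	/* 1 */
	return 0;
}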
diff --git a/lib/crc/tests/crc_kunit.c b/lib/crc/tests/crc_kunit.c index 9a450e25ac81..9428cd913625 100644 --- a/lib/crc/tests/crc_kunit.c +++ b/lib/crc/tests/crc_kunit.c @@ -268,8 +268,7 @@ crc_benchmark(struct kunit *test, } } -/* crc7_be */ - +#if IS_REACHABLE(CONFIG_CRC7) static u64 crc7_be_wrapper(u64 crc, const u8 *p, size_t len) { /* @@ -294,9 +293,9 @@ static void crc7_be_benchmark(struct kunit *test) { crc_benchmark(test, crc7_be_wrapper); } +#endif /* CONFIG_CRC7 */ -/* crc16 */ - +#if IS_REACHABLE(CONFIG_CRC16) static u64 crc16_wrapper(u64 crc, const u8 *p, size_t len) { return crc16(crc, p, len); @@ -318,9 +317,9 @@ static void crc16_benchmark(struct kunit *test) { crc_benchmark(test, crc16_wrapper); } +#endif /* CONFIG_CRC16 */ -/* crc_t10dif */ - +#if IS_REACHABLE(CONFIG_CRC_T10DIF) static u64 crc_t10dif_wrapper(u64 crc, const u8 *p, size_t len) { return crc_t10dif_update(crc, p, len); @@ -342,6 +341,9 @@ static void crc_t10dif_benchmark(struct kunit *test) { crc_benchmark(test, crc_t10dif_wrapper); } +#endif /* CONFIG_CRC_T10DIF */ + +#if IS_REACHABLE(CONFIG_CRC32) /* crc32_le */ @@ -414,6 +416,9 @@ static void crc32c_benchmark(struct kunit *test) { crc_benchmark(test, crc32c_wrapper); } +#endif /* CONFIG_CRC32 */ + +#if IS_REACHABLE(CONFIG_CRC64) /* crc64_be */ @@ -463,24 +468,35 @@ static void crc64_nvme_benchmark(struct kunit *test) { crc_benchmark(test, crc64_nvme_wrapper); } +#endif /* CONFIG_CRC64 */ static struct kunit_case crc_test_cases[] = { +#if IS_REACHABLE(CONFIG_CRC7) KUNIT_CASE(crc7_be_test), KUNIT_CASE(crc7_be_benchmark), +#endif +#if IS_REACHABLE(CONFIG_CRC16) KUNIT_CASE(crc16_test), KUNIT_CASE(crc16_benchmark), +#endif +#if IS_REACHABLE(CONFIG_CRC_T10DIF) KUNIT_CASE(crc_t10dif_test), KUNIT_CASE(crc_t10dif_benchmark), +#endif +#if IS_REACHABLE(CONFIG_CRC32) KUNIT_CASE(crc32_le_test), KUNIT_CASE(crc32_le_benchmark), KUNIT_CASE(crc32_be_test), KUNIT_CASE(crc32_be_benchmark), KUNIT_CASE(crc32c_test), KUNIT_CASE(crc32c_benchmark), +#endif +#if IS_REACHABLE(CONFIG_CRC64) KUNIT_CASE(crc64_be_test), KUNIT_CASE(crc64_be_benchmark), KUNIT_CASE(crc64_nvme_test), KUNIT_CASE(crc64_nvme_benchmark), +#endif {}, }; -- cgit v1.2.3 From cdf22aeaad8430905c3aa3b3d0f2686c65395c22 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Mar 2026 19:35:56 -0800 Subject: lib/crc: tests: Add CRC_ENABLE_ALL_FOR_KUNIT Now that crc_kunit uses the standard "depends on" pattern, enabling the full set of CRC tests is a bit difficult, mainly due to CRC7 being rarely used. Add a kconfig option to make it easier. It is visible only when KUNIT, so hopefully the extra prompt won't be too annoying. Link: https://lore.kernel.org/r/20260306033557.250499-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crc/Kconfig | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'lib') diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig index 9ddfd1a29757..cca228879bb5 100644 --- a/lib/crc/Kconfig +++ b/lib/crc/Kconfig @@ -107,6 +107,20 @@ config CRC_KUNIT_TEST This is intended to help people writing architecture-specific optimized versions. If unsure, say N. +config CRC_ENABLE_ALL_FOR_KUNIT + tristate "Enable all CRC functions for KUnit test" + depends on KUNIT + select CRC7 + select CRC16 + select CRC_T10DIF + select CRC32 + select CRC64 + help + Enable all CRC functions that have test code in CRC_KUNIT_TEST. + + Enable this only if you'd like the CRC KUnit test suite to test all + the CRC variants, even ones that wouldn't otherwise need to be built. 
+ config CRC_BENCHMARK bool "Benchmark for the CRC functions" depends on CRC_KUNIT_TEST -- cgit v1.2.3 From c13cee2fc7f137dd25ed50c63eddcc578624f204 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 5 Mar 2026 19:35:57 -0800 Subject: lib/crc: tests: Add a .kunitconfig file Add a .kunitconfig file to the lib/crc/ directory so that the CRC library tests can be run more easily using kunit.py. Example with UML: tools/testing/kunit/kunit.py run --kunitconfig=lib/crc Example with QEMU: tools/testing/kunit/kunit.py run --kunitconfig=lib/crc --arch=arm64 --make_options LLVM=1 Link: https://lore.kernel.org/r/20260306033557.250499-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crc/.kunitconfig | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 lib/crc/.kunitconfig (limited to 'lib') diff --git a/lib/crc/.kunitconfig b/lib/crc/.kunitconfig new file mode 100644 index 000000000000..0a3671ba573f --- /dev/null +++ b/lib/crc/.kunitconfig @@ -0,0 +1,3 @@ +CONFIG_KUNIT=y +CONFIG_CRC_ENABLE_ALL_FOR_KUNIT=y +CONFIG_CRC_KUNIT_TEST=y -- cgit v1.2.3 From 6e4d63e8993c681e1cec7d564b4e018e21e658d0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 14 Mar 2026 10:57:44 -0700 Subject: lib/crc: arm64: Drop check for CONFIG_KERNEL_MODE_NEON CONFIG_KERNEL_MODE_NEON is always enabled on arm64, and it always has been since its introduction in 2013. Given that and the fact that the usefulness of kernel-mode NEON has only been increasing over time, checking for this option in arm64-specific code is unnecessary. Remove this check from lib/crc/ to simplify the code and prevent any future bugs where e.g. code gets disabled due to a typo in this logic. Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20260314175744.30620-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig index cca228879bb5..52e216f39746 100644 --- a/lib/crc/Kconfig +++ b/lib/crc/Kconfig @@ -48,7 +48,7 @@ config CRC_T10DIF_ARCH bool depends on CRC_T10DIF && CRC_OPTIMIZATIONS default y if ARM && KERNEL_MODE_NEON - default y if ARM64 && KERNEL_MODE_NEON + default y if ARM64 default y if PPC64 && ALTIVEC default y if RISCV && RISCV_ISA_ZBC default y if X86 -- cgit v1.2.3 From 63432fd625372a0e79fb00a4009af204f4edc013 Mon Sep 17 00:00:00 2001 From: Demian Shulhan Date: Sun, 29 Mar 2026 07:43:38 +0000 Subject: lib/crc: arm64: add NEON accelerated CRC64-NVMe implementation Implement an optimized CRC64 (NVMe) algorithm for ARM64 using NEON Polynomial Multiply Long (PMULL) instructions. The generic shift-and-XOR software implementation is slow, which creates a bottleneck in NVMe and other storage subsystems. The acceleration is implemented using C intrinsics rather than raw assembly for better readability and maintainability. Key highlights of this implementation: - Uses 4KB chunking inside scoped_ksimd() to avoid preemption latency spikes on large buffers. - Pre-calculates and loads fold constants via vld1q_u64() to minimize register spilling. - Benchmarks show the break-even point against the generic implementation is around 128 bytes. The PMULL path is enabled only for len >= 128.
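Before the performance numbers, a rough sketch of the scalar baseline the commit message refers to may help. This is illustrative only: the function name is invented and the CRC64-NVMe polynomial is left as a parameter rather than quoted from the patch; the point is simply that a bit-at-a-time shift-and-XOR update costs eight dependent steps per byte, which is what folding 16 bytes per pair of carryless multiplies avoids.

#include <stddef.h>
#include <stdint.h>

/* Reflected (LSB-first) CRC64, one bit at a time. */
uint64_t crc64_bitwise(uint64_t crc, const uint8_t *p, size_t len,
		       uint64_t reflected_poly)
{
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1) ? reflected_poly : 0);
	}
	return crc;
}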
Performance results (kunit crc_benchmark on Cortex-A72): - Generic (len=4096): ~268 MB/s - PMULL (len=4096): ~1556 MB/s (nearly 6x improvement) Signed-off-by: Demian Shulhan Link: https://lore.kernel.org/r/20260329074338.1053550-1-demyansh@gmail.com Signed-off-by: Eric Biggers --- lib/crc/Kconfig | 1 + lib/crc/Makefile | 8 ++++- lib/crc/arm64/crc64-neon-inner.c | 78 ++++++++++++++++++++++++++++++++++++++++ lib/crc/arm64/crc64.h | 30 ++++++++++++++++ 4 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 lib/crc/arm64/crc64-neon-inner.c create mode 100644 lib/crc/arm64/crc64.h (limited to 'lib') diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig index 52e216f39746..31038c8d111a 100644 --- a/lib/crc/Kconfig +++ b/lib/crc/Kconfig @@ -82,6 +82,7 @@ config CRC64 config CRC64_ARCH bool depends on CRC64 && CRC_OPTIMIZATIONS + default y if ARM64 default y if RISCV && RISCV_ISA_ZBC && 64BIT default y if X86_64 diff --git a/lib/crc/Makefile b/lib/crc/Makefile index 7543ad295ab6..c9c35419b39c 100644 --- a/lib/crc/Makefile +++ b/lib/crc/Makefile @@ -38,9 +38,15 @@ obj-$(CONFIG_CRC64) += crc64.o crc64-y := crc64-main.o ifeq ($(CONFIG_CRC64_ARCH),y) CFLAGS_crc64-main.o += -I$(src)/$(SRCARCH) + +CFLAGS_REMOVE_arm64/crc64-neon-inner.o += -mgeneral-regs-only +CFLAGS_arm64/crc64-neon-inner.o += -ffreestanding -march=armv8-a+crypto +CFLAGS_arm64/crc64-neon-inner.o += -isystem $(shell $(CC) -print-file-name=include) +crc64-$(CONFIG_ARM64) += arm64/crc64-neon-inner.o + crc64-$(CONFIG_RISCV) += riscv/crc64_lsb.o riscv/crc64_msb.o crc64-$(CONFIG_X86) += x86/crc64-pclmul.o -endif +endif # CONFIG_CRC64_ARCH obj-y += tests/ diff --git a/lib/crc/arm64/crc64-neon-inner.c b/lib/crc/arm64/crc64-neon-inner.c new file mode 100644 index 000000000000..881cdafadb37 --- /dev/null +++ b/lib/crc/arm64/crc64-neon-inner.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Accelerated CRC64 (NVMe) using ARM NEON C intrinsics + */ + +#include +#include + +u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len); + +#define GET_P64_0(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 0)) +#define GET_P64_1(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 1)) + +/* x^191 mod G, x^127 mod G */ +static const u64 fold_consts_val[2] = { 0xeadc41fd2ba3d420ULL, + 0x21e9761e252621acULL }; +/* floor(x^127 / G), (G - x^64) / x */ +static const u64 bconsts_val[2] = { 0x27ecfa329aef9f77ULL, + 0x34d926535897936aULL }; + +u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len) +{ + uint64x2_t v0_u64 = { crc, 0 }; + poly64x2_t v0 = vreinterpretq_p64_u64(v0_u64); + poly64x2_t fold_consts = + vreinterpretq_p64_u64(vld1q_u64(fold_consts_val)); + poly64x2_t v1 = vreinterpretq_p64_u8(vld1q_u8(p)); + + v0 = vreinterpretq_p64_u8(veorq_u8(vreinterpretq_u8_p64(v0), + vreinterpretq_u8_p64(v1))); + p += 16; + len -= 16; + + do { + v1 = vreinterpretq_p64_u8(vld1q_u8(p)); + + poly128_t v2 = vmull_high_p64(fold_consts, v0); + poly128_t v0_128 = + vmull_p64(GET_P64_0(fold_consts), GET_P64_0(v0)); + + uint8x16_t x0 = veorq_u8(vreinterpretq_u8_p128(v0_128), + vreinterpretq_u8_p128(v2)); + + x0 = veorq_u8(x0, vreinterpretq_u8_p64(v1)); + v0 = vreinterpretq_p64_u8(x0); + + p += 16; + len -= 16; + } while (len >= 16); + + /* Multiply the 128-bit value by x^64 and reduce it back to 128 bits. 
*/ + poly64x2_t v7 = vreinterpretq_p64_u64((uint64x2_t){ 0, 0 }); + poly128_t v1_128 = vmull_p64(GET_P64_1(fold_consts), GET_P64_0(v0)); + + uint8x16_t ext_v0 = + vextq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p64(v7), 8); + uint8x16_t x0 = veorq_u8(ext_v0, vreinterpretq_u8_p128(v1_128)); + + v0 = vreinterpretq_p64_u8(x0); + + /* Final Barrett reduction */ + poly64x2_t bconsts = vreinterpretq_p64_u64(vld1q_u64(bconsts_val)); + + v1_128 = vmull_p64(GET_P64_0(bconsts), GET_P64_0(v0)); + + poly64x2_t v1_64 = vreinterpretq_p64_u8(vreinterpretq_u8_p128(v1_128)); + poly128_t v3_128 = vmull_p64(GET_P64_1(bconsts), GET_P64_0(v1_64)); + + x0 = veorq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p128(v3_128)); + + uint8x16_t ext_v2 = vextq_u8(vreinterpretq_u8_p64(v7), + vreinterpretq_u8_p128(v1_128), 8); + + x0 = veorq_u8(x0, ext_v2); + + v0 = vreinterpretq_p64_u8(x0); + return vgetq_lane_u64(vreinterpretq_u64_p64(v0), 1); +} diff --git a/lib/crc/arm64/crc64.h b/lib/crc/arm64/crc64.h new file mode 100644 index 000000000000..cc65abeee24c --- /dev/null +++ b/lib/crc/arm64/crc64.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * CRC64 using ARM64 PMULL instructions + */ + +#include +#include +#include +#include + +u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len); + +#define crc64_be_arch crc64_be_generic + +static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len) +{ + if (len >= 128 && cpu_have_named_feature(PMULL) && + likely(may_use_simd())) { + do { + size_t chunk = min_t(size_t, len & ~15, SZ_4K); + + scoped_ksimd() + crc = crc64_nvme_arm64_c(crc, p, chunk); + + p += chunk; + len -= chunk; + } while (len >= 128); + } + return crc64_nvme_generic(crc, p, len); +} -- cgit v1.2.3 From 5276ea17a23c829d4e4417569abff71a1c8342d9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 31 Mar 2026 17:44:31 -0700 Subject: lib/crc: arm64: Assume a little-endian kernel Since support for big-endian arm64 kernels was removed, the CPU_LE() macro now unconditionally emits the code it is passed, and the CPU_BE() macro now unconditionally discards the code it is passed. Simplify the assembly code in lib/crc/arm64/ accordingly. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20260401004431.151432-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crc/arm64/crc-t10dif-core.S | 56 ++++++++++++++++++++--------------------- lib/crc/arm64/crc32-core.S | 9 ++----- 2 files changed, 30 insertions(+), 35 deletions(-) (limited to 'lib') diff --git a/lib/crc/arm64/crc-t10dif-core.S b/lib/crc/arm64/crc-t10dif-core.S index 87dd6d46224d..71388466825b 100644 --- a/lib/crc/arm64/crc-t10dif-core.S +++ b/lib/crc/arm64/crc-t10dif-core.S @@ -181,13 +181,13 @@ SYM_FUNC_END(__pmull_p8_16x64) pmull16x64_\p fold_consts, \reg1, v8 -CPU_LE( rev64 v11.16b, v11.16b ) -CPU_LE( rev64 v12.16b, v12.16b ) + rev64 v11.16b, v11.16b + rev64 v12.16b, v12.16b pmull16x64_\p fold_consts, \reg2, v9 -CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 ) -CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 eor \reg1\().16b, \reg1\().16b, v8.16b eor \reg2\().16b, \reg2\().16b, v9.16b @@ -220,22 +220,22 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) ldp q4, q5, [buf, #0x40] ldp q6, q7, [buf, #0x60] add buf, buf, #0x80 -CPU_LE( rev64 v0.16b, v0.16b ) -CPU_LE( rev64 v1.16b, v1.16b ) -CPU_LE( rev64 v2.16b, v2.16b ) -CPU_LE( rev64 v3.16b, v3.16b ) -CPU_LE( rev64 v4.16b, v4.16b ) -CPU_LE( rev64 v5.16b, v5.16b ) -CPU_LE( rev64 v6.16b, v6.16b ) -CPU_LE( rev64 v7.16b, v7.16b ) -CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) -CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 ) -CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 ) -CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 ) -CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 ) -CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 ) -CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 ) -CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + rev64 v0.16b, v0.16b + rev64 v1.16b, v1.16b + rev64 v2.16b, v2.16b + rev64 v3.16b, v3.16b + rev64 v4.16b, v4.16b + rev64 v5.16b, v5.16b + rev64 v6.16b, v6.16b + rev64 v7.16b, v7.16b + ext v0.16b, v0.16b, v0.16b, #8 + ext v1.16b, v1.16b, v1.16b, #8 + ext v2.16b, v2.16b, v2.16b, #8 + ext v3.16b, v3.16b, v3.16b, #8 + ext v4.16b, v4.16b, v4.16b, #8 + ext v5.16b, v5.16b, v5.16b, #8 + ext v6.16b, v6.16b, v6.16b, #8 + ext v7.16b, v7.16b, v7.16b, #8 // XOR the first 16 data *bits* with the initial CRC value. movi v8.16b, #0 @@ -288,8 +288,8 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) pmull16x64_\p fold_consts, v7, v8 eor v7.16b, v7.16b, v8.16b ldr q0, [buf], #16 -CPU_LE( rev64 v0.16b, v0.16b ) -CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) + rev64 v0.16b, v0.16b + ext v0.16b, v0.16b, v0.16b, #8 eor v7.16b, v7.16b, v0.16b subs len, len, #16 b.ge .Lfold_16_bytes_loop_\@ @@ -310,8 +310,8 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) // v0 = last 16 original data bytes add buf, buf, len ldr q0, [buf, #-16] -CPU_LE( rev64 v0.16b, v0.16b ) -CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) + rev64 v0.16b, v0.16b + ext v0.16b, v0.16b, v0.16b, #8 // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes. adr_l x4, .Lbyteshift_table + 16 @@ -344,8 +344,8 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) // Load the first 16 data bytes. ldr q7, [buf], #0x10 -CPU_LE( rev64 v7.16b, v7.16b ) -CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + rev64 v7.16b, v7.16b + ext v7.16b, v7.16b, v7.16b, #8 // XOR the first 16 data *bits* with the initial CRC value. 
movi v0.16b, #0 @@ -382,8 +382,8 @@ SYM_FUNC_START(crc_t10dif_pmull_p8) crc_t10dif_pmull p8 -CPU_LE( rev64 v7.16b, v7.16b ) -CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + rev64 v7.16b, v7.16b + ext v7.16b, v7.16b, v7.16b, #8 str q7, [x3] frame_pop diff --git a/lib/crc/arm64/crc32-core.S b/lib/crc/arm64/crc32-core.S index 68825317460f..49d02cc485b3 100644 --- a/lib/crc/arm64/crc32-core.S +++ b/lib/crc/arm64/crc32-core.S @@ -29,24 +29,19 @@ .endm .macro hwordle, reg -CPU_BE( rev16 \reg, \reg ) .endm .macro hwordbe, reg -CPU_LE( rev \reg, \reg ) + rev \reg, \reg rbit \reg, \reg -CPU_BE( lsr \reg, \reg, #16 ) .endm .macro le, regs:vararg - .irp r, \regs -CPU_BE( rev \r, \r ) - .endr .endm .macro be, regs:vararg .irp r, \regs -CPU_LE( rev \r, \r ) + rev \r, \r .endr .irp r, \regs rbit \r, \r -- cgit v1.2.3 From e0718ed60d60299840cfc2a408eb26042a20d186 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Mar 2026 16:46:32 +0200 Subject: lib/crc: arm64: Drop unnecessary chunking logic from crc64 On arm64, kernel mode NEON executes with preemption enabled, so there is no need to chunk the input by hand. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20260330144630.33026-8-ardb@kernel.org Signed-off-by: Eric Biggers --- lib/crc/arm64/crc64.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/crc/arm64/crc64.h b/lib/crc/arm64/crc64.h index cc65abeee24c..60151ec3035a 100644 --- a/lib/crc/arm64/crc64.h +++ b/lib/crc/arm64/crc64.h @@ -16,15 +16,13 @@ static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len) { if (len >= 128 && cpu_have_named_feature(PMULL) && likely(may_use_simd())) { - do { - size_t chunk = min_t(size_t, len & ~15, SZ_4K); + size_t chunk = len & ~15; - scoped_ksimd() - crc = crc64_nvme_arm64_c(crc, p, chunk); + scoped_ksimd() + crc = crc64_nvme_arm64_c(crc, p, chunk); - p += chunk; - len -= chunk; - } while (len >= 128); + p += chunk; + len &= 15; } return crc64_nvme_generic(crc, p, len); } -- cgit v1.2.3 From f956dc813144baf8bd2d77eec61b90bc00c10894 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Mar 2026 16:46:33 +0200 Subject: lib/crc: arm64: Use existing macros for kernel-mode FPU cflags Use the existing CC_FLAGS_FPU and CC_FLAGS_NO_FPU to pass the appropriate compiler command line options for building kernel mode NEON intrinsics code. This is tidier, and will make it easier to reuse the code for 32-bit ARM.
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20260330144630.33026-9-ardb@kernel.org Signed-off-by: Eric Biggers --- lib/crc/Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/crc/Makefile b/lib/crc/Makefile index c9c35419b39c..ff213590e4e3 100644 --- a/lib/crc/Makefile +++ b/lib/crc/Makefile @@ -39,9 +39,8 @@ crc64-y := crc64-main.o ifeq ($(CONFIG_CRC64_ARCH),y) CFLAGS_crc64-main.o += -I$(src)/$(SRCARCH) -CFLAGS_REMOVE_arm64/crc64-neon-inner.o += -mgeneral-regs-only -CFLAGS_arm64/crc64-neon-inner.o += -ffreestanding -march=armv8-a+crypto -CFLAGS_arm64/crc64-neon-inner.o += -isystem $(shell $(CC) -print-file-name=include) +CFLAGS_REMOVE_arm64/crc64-neon-inner.o += $(CC_FLAGS_NO_FPU) +CFLAGS_arm64/crc64-neon-inner.o += $(CC_FLAGS_FPU) -march=armv8-a+crypto crc64-$(CONFIG_ARM64) += arm64/crc64-neon-inner.o crc64-$(CONFIG_RISCV) += riscv/crc64_lsb.o riscv/crc64_msb.o -- cgit v1.2.3 From 8fdef85d601db670e9c178314eedffe7bbb07e52 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Mar 2026 16:46:35 +0200 Subject: lib/crc: arm64: Simplify intrinsics implementation NEON intrinsics are useful because they remove the need for manual register allocation, and the resulting code can be re-compiled and optimized for different micro-architectures, and shared between arm64 and 32-bit ARM. However, the strong typing of the vector variables can lead to incomprehensible gibberish, as is the case with the new CRC64 implementation. To address this, let's repaint all variables as uint64x2_t to minimize the number of vreinterpretq_xxx() calls, and to be able to rely on the ^ operator for exclusive OR operations. This makes the code much more concise and readable. While at it, wrap the calls to vmull_p64() et al in order to have a more consistent calling convention, and encapsulate any remaining vreinterpret() calls that are still needed. 
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20260330144630.33026-11-ardb@kernel.org Signed-off-by: Eric Biggers --- lib/crc/arm64/crc64-neon-inner.c | 77 +++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 45 deletions(-) (limited to 'lib') diff --git a/lib/crc/arm64/crc64-neon-inner.c b/lib/crc/arm64/crc64-neon-inner.c index 881cdafadb37..28527e544ff6 100644 --- a/lib/crc/arm64/crc64-neon-inner.c +++ b/lib/crc/arm64/crc64-neon-inner.c @@ -8,9 +8,6 @@ u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len); -#define GET_P64_0(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 0)) -#define GET_P64_1(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 1)) - /* x^191 mod G, x^127 mod G */ static const u64 fold_consts_val[2] = { 0xeadc41fd2ba3d420ULL, 0x21e9761e252621acULL }; @@ -18,61 +15,51 @@ static const u64 fold_consts_val[2] = { 0xeadc41fd2ba3d420ULL, static const u64 bconsts_val[2] = { 0x27ecfa329aef9f77ULL, 0x34d926535897936aULL }; -u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len) +static inline uint64x2_t pmull64(uint64x2_t a, uint64x2_t b) { - uint64x2_t v0_u64 = { crc, 0 }; - poly64x2_t v0 = vreinterpretq_p64_u64(v0_u64); - poly64x2_t fold_consts = - vreinterpretq_p64_u64(vld1q_u64(fold_consts_val)); - poly64x2_t v1 = vreinterpretq_p64_u8(vld1q_u8(p)); + return vreinterpretq_u64_p128(vmull_p64(vgetq_lane_u64(a, 0), + vgetq_lane_u64(b, 0))); +} - v0 = vreinterpretq_p64_u8(veorq_u8(vreinterpretq_u8_p64(v0), - vreinterpretq_u8_p64(v1))); - p += 16; - len -= 16; +static inline uint64x2_t pmull64_high(uint64x2_t a, uint64x2_t b) +{ + poly64x2_t l = vreinterpretq_p64_u64(a); + poly64x2_t m = vreinterpretq_p64_u64(b); - do { - v1 = vreinterpretq_p64_u8(vld1q_u8(p)); + return vreinterpretq_u64_p128(vmull_high_p64(l, m)); +} - poly128_t v2 = vmull_high_p64(fold_consts, v0); - poly128_t v0_128 = - vmull_p64(GET_P64_0(fold_consts), GET_P64_0(v0)); +static inline uint64x2_t pmull64_hi_lo(uint64x2_t a, uint64x2_t b) +{ + return vreinterpretq_u64_p128(vmull_p64(vgetq_lane_u64(a, 1), + vgetq_lane_u64(b, 0))); +} - uint8x16_t x0 = veorq_u8(vreinterpretq_u8_p128(v0_128), - vreinterpretq_u8_p128(v2)); +u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len) +{ + uint64x2_t fold_consts = vld1q_u64(fold_consts_val); + uint64x2_t v0 = { crc, 0 }; + uint64x2_t zero = { }; - x0 = veorq_u8(x0, vreinterpretq_u8_p64(v1)); - v0 = vreinterpretq_p64_u8(x0); + for (;;) { + v0 ^= vreinterpretq_u64_u8(vld1q_u8(p)); p += 16; len -= 16; - } while (len >= 16); - - /* Multiply the 128-bit value by x^64 and reduce it back to 128 bits. */ - poly64x2_t v7 = vreinterpretq_p64_u64((uint64x2_t){ 0, 0 }); - poly128_t v1_128 = vmull_p64(GET_P64_1(fold_consts), GET_P64_0(v0)); + if (len < 16) + break; - uint8x16_t ext_v0 = - vextq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p64(v7), 8); - uint8x16_t x0 = veorq_u8(ext_v0, vreinterpretq_u8_p128(v1_128)); + v0 = pmull64(fold_consts, v0) ^ pmull64_high(fold_consts, v0); + } - v0 = vreinterpretq_p64_u8(x0); + /* Multiply the 128-bit value by x^64 and reduce it back to 128 bits. 
*/ + v0 = vextq_u64(v0, zero, 1) ^ pmull64_hi_lo(fold_consts, v0); /* Final Barrett reduction */ - poly64x2_t bconsts = vreinterpretq_p64_u64(vld1q_u64(bconsts_val)); - - v1_128 = vmull_p64(GET_P64_0(bconsts), GET_P64_0(v0)); - - poly64x2_t v1_64 = vreinterpretq_p64_u8(vreinterpretq_u8_p128(v1_128)); - poly128_t v3_128 = vmull_p64(GET_P64_1(bconsts), GET_P64_0(v1_64)); - - x0 = veorq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p128(v3_128)); - - uint8x16_t ext_v2 = vextq_u8(vreinterpretq_u8_p64(v7), - vreinterpretq_u8_p128(v1_128), 8); + uint64x2_t bconsts = vld1q_u64(bconsts_val); + uint64x2_t final = pmull64(bconsts, v0); - x0 = veorq_u8(x0, ext_v2); + v0 ^= vextq_u64(zero, final, 1) ^ pmull64_hi_lo(bconsts, final); - v0 = vreinterpretq_p64_u8(x0); - return vgetq_lane_u64(vreinterpretq_u64_p64(v0), 1); + return vgetq_lane_u64(v0, 1); } -- cgit v1.2.3
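For readers working through the folding math without a PMULL-capable machine, the helpers from the final patch can be modeled with a scalar carryless multiply. Everything below is an illustrative sketch rather than kernel code: struct u128, clmul64() and fold16() are invented for this note, and only the lane selection mirrors pmull64(), pmull64_high() and pmull64_hi_lo() above.

#include <stdint.h>

struct u128 { uint64_t lo, hi; };

/* 64 x 64 -> 128-bit carryless (polynomial) multiplication. */
struct u128 clmul64(uint64_t a, uint64_t b)
{
	struct u128 r = { 0, 0 };

	for (int i = 0; i < 64; i++) {
		if (b & (1ULL << i)) {
			r.lo ^= a << i;
			r.hi ^= i ? a >> (64 - i) : 0;
		}
	}
	return r;
}

/*
 * One fold step of the main loop: the low lanes go through what pmull64()
 * computes, the high lanes through pmull64_high(), and the folded value is
 * then XORed with the next 16 bytes of input.
 */
struct u128 fold16(struct u128 fold_consts, struct u128 v0, struct u128 next)
{
	struct u128 lo = clmul64(fold_consts.lo, v0.lo);
	struct u128 hi = clmul64(fold_consts.hi, v0.hi);

	return (struct u128){ lo.lo ^ hi.lo ^ next.lo, lo.hi ^ hi.hi ^ next.hi };
}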